Here's the output from the script. The diffstat can serve as a guiding list
of the files to be converted. (Kconfig and Makefiles can be ignored.)
arch/i386/Kconfig | 718 +++++----------------
arch/i386/Makefile | 128 ---
arch/i386/kernel/Makefile | 79 +-
arch/i386/kernel/acpi/Makefile | 15
arch/i386/kernel/acpi/boot.c | 26
arch/i386/kernel/apic.c | 1201 ------------------------------------
arch/i386/kernel/cpu/Makefile | 34 -
arch/i386/kernel/cpu/common.c | 58 -
arch/i386/kernel/cpu/mtrr/Makefile | 19
arch/i386/kernel/cpu/mtrr/main.c | 629 +-----------------
arch/i386/kernel/entry.S | 358 ++++++----
arch/i386/kernel/head.S | 457 +------------
arch/i386/kernel/i386_ksyms.c | 2
arch/i386/kernel/io_apic.c | 104 ++-
arch/i386/kernel/ioport.c | 74 --
arch/i386/kernel/irq.c | 66 +
arch/i386/kernel/ldt.c | 32
arch/i386/kernel/microcode.c | 375 -----------
arch/i386/kernel/mpparse.c | 27
arch/i386/kernel/pci-dma.c | 141 ++++
arch/i386/kernel/process.c | 291 +++-----
arch/i386/kernel/quirks.c | 11
arch/i386/kernel/setup.c | 241 ++++++-
arch/i386/kernel/signal.c | 2
arch/i386/kernel/smp.c | 208 +++---
arch/i386/kernel/smpboot.c | 476 ++++++++++++--
arch/i386/kernel/time.c | 553 +++++++++++++++-
arch/i386/kernel/timers/Makefile | 16
arch/i386/kernel/timers/timer_tsc.c | 277 +-------
arch/i386/kernel/traps.c | 210 ++----
arch/i386/kernel/vsyscall.S | 4
arch/i386/mach-default/Makefile | 9
arch/i386/mm/Makefile | 22
arch/i386/mm/fault.c | 35 -
arch/i386/mm/highmem.c | 15
arch/i386/mm/hypervisor.c | 363 ++++++++++
arch/i386/mm/init.c | 131 +++
arch/i386/mm/ioremap.c | 312 ++++++---
arch/i386/mm/pgtable.c | 309 ++++++++-
arch/i386/pci/Makefile | 38 -
arch/i386/pci/irq.c | 5
41 files changed, 3673 insertions(+), 4398 deletions(-)
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/Kconfig linux-2.6-xen-sparse/arch/i386/Kconfig
--- pristine-linux-2.6.12/arch/i386/Kconfig 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/Kconfig 2005-07-28 13:17:07.000000000 -0700
@@ -3,7 +3,11 @@
# see Documentation/kbuild/kconfig-language.txt.
#
-mainmenu "Linux Kernel Configuration"
+menu "X86 Processor Configuration"
+
+config XENARCH
+ string
+ default i386
config X86
bool
@@ -33,119 +37,6 @@ config GENERIC_IOMAP
bool
default y
-source "init/Kconfig"
-
-menu "Processor type and features"
-
-choice
- prompt "Subarchitecture Type"
- default X86_PC
-
-config X86_PC
- bool "PC-compatible"
- help
- Choose this option if your computer is a standard PC or compatible.
-
-config X86_ELAN
- bool "AMD Elan"
- help
- Select this for an AMD Elan processor.
-
- Do not use this option for K6/Athlon/Opteron processors!
-
- If unsure, choose "PC-compatible" instead.
-
-config X86_VOYAGER
- bool "Voyager (NCR)"
- help
- Voyager is an MCA-based 32-way capable SMP architecture proprietary
- to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based.
-
- *** WARNING ***
-
- If you do not specifically know you have a Voyager based machine,
- say N here, otherwise the kernel you build will not be bootable.
-
-config X86_NUMAQ
- bool "NUMAQ (IBM/Sequent)"
- select DISCONTIGMEM
- select NUMA
- help
- This option is used for getting Linux to run on a (IBM/Sequent) NUMA
- multiquad box. This changes the way that processors are bootstrapped,
- and uses Clustered Logical APIC addressing mode instead of Flat Logical.
- You will need a new lynxer.elf file to flash your firmware with - send
- email to <Martin.Bligh@xxxxxxxxxx>.
-
-config X86_SUMMIT
- bool "Summit/EXA (IBM x440)"
- depends on SMP
- help
- This option is needed for IBM systems that use the Summit/EXA chipset.
- In particular, it is needed for the x440.
-
- If you don't have one of these computers, you should say N here.
-
-config X86_BIGSMP
- bool "Support for other sub-arch SMP systems with more than 8 CPUs"
- depends on SMP
- help
- This option is needed for the systems that have more than 8 CPUs
- and if the system is not of any sub-arch type above.
-
- If you don't have such a system, you should say N here.
-
-config X86_VISWS
- bool "SGI 320/540 (Visual Workstation)"
- help
- The SGI Visual Workstation series is an IA32-based workstation
- based on SGI systems chips with some legacy PC hardware attached.
-
- Say Y here to create a kernel to run on the SGI 320 or 540.
-
- A kernel compiled for the Visual Workstation will not run on PCs
- and vice versa. See <file:Documentation/sgi-visws.txt> for details.
-
-config X86_GENERICARCH
- bool "Generic architecture (Summit, bigsmp, ES7000, default)"
- depends on SMP
- help
- This option compiles in the Summit, bigsmp, ES7000, default subarchitectures.
- It is intended for a generic binary kernel.
-
-config X86_ES7000
- bool "Support for Unisys ES7000 IA32 series"
- depends on SMP
- help
- Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
- supposed to run on an IA32-based Unisys ES7000 system.
- Only choose this option if you have such a system, otherwise you
- should say N here.
-
-endchoice
-
-config ACPI_SRAT
- bool
- default y
- depends on NUMA && (X86_SUMMIT || X86_GENERICARCH)
-
-config X86_SUMMIT_NUMA
- bool
- default y
- depends on NUMA && (X86_SUMMIT || X86_GENERICARCH)
-
-config X86_CYCLONE_TIMER
- bool
- default y
- depends on X86_SUMMIT || X86_GENERICARCH
-
-config ES7000_CLUSTERED_APIC
- bool
- default y
- depends on SMP && X86_ES7000 && MPENTIUMIII
-
-if !X86_ELAN
-
choice
prompt "Processor family"
default M686
@@ -347,8 +238,6 @@ config X86_GENERIC
This is really intended for distributors who need more
generic optimizations.
-endif
-
#
# Define implied options from the CPU selection here
#
@@ -444,19 +333,21 @@ config X86_OOSTORE
default y
config HPET_TIMER
- bool "HPET Timer Support"
- help
- This enables the use of the HPET for the kernel's internal timer.
- HPET is the next generation timer replacing legacy 8254s.
- You can safely choose Y here. However, HPET will only be
- activated if the platform and the BIOS support this feature.
- Otherwise the 8254 will be used for timing services.
-
- Choose N to continue using the legacy 8254 timer.
+ bool
+ default n
+#config HPET_TIMER
+# bool "HPET Timer Support"
+# help
+# This enables the use of the HPET for the kernel's internal timer.
+# HPET is the next generation timer replacing legacy 8254s.
+# You can safely choose Y here. However, HPET will only be
+# activated if the platform and the BIOS support this feature.
+# Otherwise the 8254 will be used for timing services.
+#
+# Choose N to continue using the legacy 8254 timer.
config HPET_EMULATE_RTC
- bool "Provide RTC interrupt"
- depends on HPET_TIMER && RTC=y
+ def_bool HPET_TIMER && RTC=y
config SMP
bool "Symmetric multi-processing support"
@@ -487,6 +378,19 @@ config SMP
If you don't know what to do here, say N.
+config SMP_ALTERNATIVES
+ bool "SMP alternatives support (EXPERIMENTAL)"
+ depends on SMP && EXPERIMENTAL
+ help
+ Try to reduce the overhead of running an SMP kernel on a uniprocessor
+ host slightly by replacing certain key instruction sequences
+ according to whether we currently have more than one CPU available.
+ This should provide a noticeable boost to performance when
+ running SMP kernels on UP machines, and have negligible impact
+ when running on a true SMP host.
+
+ If unsure, say N.
+
config NR_CPUS
int "Maximum number of CPUs (2-255)"
range 2 255
@@ -534,122 +438,47 @@ config PREEMPT_BKL
Say Y here if you are building a kernel for a desktop system.
Say N if you are unsure.
-config X86_UP_APIC
- bool "Local APIC support on uniprocessors"
- depends on !SMP && !(X86_VISWS || X86_VOYAGER)
- help
- A local APIC (Advanced Programmable Interrupt Controller) is an
- integrated interrupt controller in the CPU. If you have a single-CPU
- system which has a processor with a local APIC, you can say Y here to
- enable and use it. If you say Y here even though your machine doesn't
- have a local APIC, then the kernel will still run with no slowdown at
- all. The local APIC supports CPU-generated self-interrupts (timer,
- performance counters), and the NMI watchdog which detects hard
- lockups.
-
-config X86_UP_IOAPIC
- bool "IO-APIC support on uniprocessors"
- depends on X86_UP_APIC
- help
- An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
- SMP-capable replacement for PC-style interrupt controllers. Most
- SMP systems and many recent uniprocessor systems have one.
-
- If you have a single-CPU system with an IO-APIC, you can say Y here
- to use it. If you say Y here even though your machine doesn't have
- an IO-APIC, then the kernel will still run with no slowdown at all.
-
-config X86_LOCAL_APIC
- bool
- depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER)
- default y
-
-config X86_IO_APIC
- bool
- depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
- default y
-
-config X86_VISWS_APIC
- bool
- depends on X86_VISWS
- default y
-
-config X86_TSC
- bool
- depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ
- default y
-
-config X86_MCE
- bool "Machine Check Exception"
- depends on !X86_VOYAGER
- ---help---
- Machine Check Exception support allows the processor to notify the
- kernel if it detects a problem (e.g. overheating, component failure).
- The action the kernel takes depends on the severity of the problem,
- ranging from a warning message on the console, to halting the machine.
- Your processor must be a Pentium or newer to support this - check the
- flags in /proc/cpuinfo for mce. Note that some older Pentium systems
- have a design flaw which leads to false MCE events - hence MCE is
- disabled on all P5 processors, unless explicitly enabled with "mce"
- as a boot argument. Similarly, if MCE is built in and creates a
- problem on some new non-standard machine, you can boot with "nomce"
- to disable it. MCE support simply ignores non-MCE processors like
- the 386 and 486, so nearly everyone can say Y here.
-
-config X86_MCE_NONFATAL
- tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel
Pentium 4"
- depends on X86_MCE
- help
- Enabling this feature starts a timer that triggers every 5 seconds which
- will look at the machine check registers to see if anything happened.
- Non-fatal problems automatically get corrected (but still logged).
- Disable this if you don't want to see these messages.
- Seeing the messages this option prints out may be indicative of dying hardware,
- or out-of-spec (ie, overclocked) hardware.
- This option only does something on certain CPUs.
- (AMD Athlon/Duron and Intel Pentium 4)
-
-config X86_MCE_P4THERMAL
- bool "check for P4 thermal throttling interrupt."
- depends on X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS
- help
- Enabling this feature will cause a message to be printed when the P4
- enters thermal throttling.
-
-config TOSHIBA
- tristate "Toshiba Laptop support"
- ---help---
- This adds a driver to safely access the System Management Mode of
- the CPU on Toshiba portables with a genuine Toshiba BIOS. It does
- not work on models with a Phoenix BIOS. The System Management Mode
- is used to set the BIOS and power saving options on Toshiba portables.
-
- For information on utilities to make use of this driver see the
- Toshiba Linux utilities web site at:
- <http://www.buzzard.org.uk/toshiba/>.
-
- Say Y if you intend to run this kernel on a Toshiba portable.
- Say N otherwise.
-
-config I8K
- tristate "Dell laptop support"
- ---help---
- This adds a driver to safely access the System Management Mode
- of the CPU on the Dell Inspiron 8000. The System Management Mode
- is used to read cpu temperature and cooling fan status and to
- control the fans on the I8K portables.
-
- This driver has been tested only on the Inspiron 8000 but it may
- also work with other Dell laptops. You can force loading on other
- models by passing the parameter `force=1' to the module. Use at
- your own risk.
-
- For information on utilities to make use of this driver see the
- I8K Linux utilities web site at:
- <http://people.debian.org/~dz/i8k/>
-
- Say Y if you intend to run this kernel on a Dell Inspiron 8000.
- Say N otherwise.
+#config X86_TSC
+# bool
+# depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ
+# default y
+
+#config X86_MCE
+# bool "Machine Check Exception"
+# depends on !X86_VOYAGER
+# ---help---
+# Machine Check Exception support allows the processor to notify the
+# kernel if it detects a problem (e.g. overheating, component failure).
+# The action the kernel takes depends on the severity of the problem,
+# ranging from a warning message on the console, to halting the machine.
+# Your processor must be a Pentium or newer to support this - check the
+# flags in /proc/cpuinfo for mce. Note that some older Pentium systems
+# have a design flaw which leads to false MCE events - hence MCE is
+# disabled on all P5 processors, unless explicitly enabled with "mce"
+# as a boot argument. Similarly, if MCE is built in and creates a
+# problem on some new non-standard machine, you can boot with "nomce"
+# to disable it. MCE support simply ignores non-MCE processors like
+# the 386 and 486, so nearly everyone can say Y here.
+
+#config X86_MCE_NONFATAL
+# tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel
Pentium 4"
+# depends on X86_MCE
+# help
+# Enabling this feature starts a timer that triggers every 5 seconds which
+# will look at the machine check registers to see if anything happened.
+# Non-fatal problems automatically get corrected (but still logged).
+# Disable this if you don't want to see these messages.
+# Seeing the messages this option prints out may be indicative of dying hardware,
+# or out-of-spec (ie, overclocked) hardware.
+# This option only does something on certain CPUs.
+# (AMD Athlon/Duron and Intel Pentium 4)
+
+#config X86_MCE_P4THERMAL
+# bool "check for P4 thermal throttling interrupt."
+# depends on X86_MCE && (X86_UP_APIC || SMP)
+# help
+# Enabling this feature will cause a message to be printed when the P4
+# enters thermal throttling.
config X86_REBOOTFIXUPS
bool "Enable X86 board specific fixups for reboot"
@@ -671,6 +500,7 @@ config X86_REBOOTFIXUPS
config MICROCODE
tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
+ depends on XEN_PRIVILEGED_GUEST
---help---
If you say Y here and also to "/dev file system support" in the
'File systems' section, you will be able to update the microcode on
@@ -686,14 +516,14 @@ config MICROCODE
To compile this driver as a module, choose M here: the
module will be called microcode.
-config X86_MSR
- tristate "/dev/cpu/*/msr - Model-specific register support"
- help
- This device gives privileged processes access to the x86
- Model-Specific Registers (MSRs). It is a character device with
- major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
- MSR accesses are directed to a specific CPU on multi-processor
- systems.
+#config X86_MSR
+# tristate "/dev/cpu/*/msr - Model-specific register support"
+# help
+# This device gives privileged processes access to the x86
+# Model-Specific Registers (MSRs). It is a character device with
+# major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
+# MSR accesses are directed to a specific CPU on multi-processor
+# systems.
config X86_CPUID
tristate "/dev/cpu/*/cpuid - CPU information support"
@@ -803,95 +633,57 @@ config NEED_NODE_MEMMAP_SIZE
depends on DISCONTIGMEM
default y
-config HIGHPTE
- bool "Allocate 3rd-level pagetables from highmem"
- depends on HIGHMEM4G || HIGHMEM64G
- help
- The VM uses one page table entry for each page of physical memory.
- For systems with a lot of RAM, this can be wasteful of precious
- low memory. Setting this option will put user-space page table
- entries in high memory.
-
-config MATH_EMULATION
- bool "Math emulation"
- ---help---
- Linux can emulate a math coprocessor (used for floating point
- operations) if you don't have one. 486DX and Pentium processors have
- a math coprocessor built in, 486SX and 386 do not, unless you added
- a 487DX or 387, respectively. (The messages during boot time can
- give you some hints here ["man dmesg"].) Everyone needs either a
- coprocessor or this emulation.
-
- If you don't have a math coprocessor, you need to say Y here; if you
- say Y here even though you have a coprocessor, the coprocessor will
- be used nevertheless. (This behavior can be changed with the kernel
- command line option "no387", which comes handy if your coprocessor
- is broken. Try "man bootparam" or see the documentation of your boot
- loader (lilo or loadlin) about how to pass options to the kernel at
- boot time.) This means that it is a good idea to say Y here if you
- intend to use this kernel on different machines.
-
- More information about the internals of the Linux math coprocessor
- emulation can be found in <file:arch/i386/math-emu/README>.
-
- If you are not sure, say Y; apart from resulting in a 66 KB bigger
- kernel, it won't hurt.
+#config HIGHPTE
+# bool "Allocate 3rd-level pagetables from highmem"
+# depends on HIGHMEM4G || HIGHMEM64G
+# help
+# The VM uses one page table entry for each page of physical memory.
+# For systems with a lot of RAM, this can be wasteful of precious
+# low memory. Setting this option will put user-space page table
+# entries in high memory.
config MTRR
- bool "MTRR (Memory Type Range Register) support"
- ---help---
- On Intel P6 family processors (Pentium Pro, Pentium II and later)
- the Memory Type Range Registers (MTRRs) may be used to control
- processor access to memory ranges. This is most useful if you have
- a video (VGA) card on a PCI or AGP bus. Enabling write-combining
- allows bus write transfers to be combined into a larger transfer
- before bursting over the PCI/AGP bus. This can increase performance
- of image write operations 2.5 times or more. Saying Y here creates a
- /proc/mtrr file which may be used to manipulate your processor's
- MTRRs. Typically the X server should use this.
-
- This code has a reasonably generic interface so that similar
- control registers on other processors can be easily supported
- as well:
-
- The Cyrix 6x86, 6x86MX and M II processors have Address Range
- Registers (ARRs) which provide a similar functionality to MTRRs. For
- these, the ARRs are used to emulate the MTRRs.
- The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
- MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing
- write-combining. All of these processors are supported by this code
- and it makes sense to say Y here if you have one of them.
-
- Saying Y here also fixes a problem with buggy SMP BIOSes which only
- set the MTRRs for the boot CPU and not for the secondary CPUs. This
- can lead to all sorts of problems, so it's good to say Y here.
-
- You can safely say Y even if your machine doesn't have MTRRs, you'll
- just add about 9 KB to your kernel.
-
- See <file:Documentation/mtrr.txt> for more information.
-
-config EFI
- bool "Boot from EFI support (EXPERIMENTAL)"
- depends on ACPI
- default n
- ---help---
- This enables the the kernel to boot on EFI platforms using
- system configuration information passed to it from the firmware.
- This also enables the kernel to use any EFI runtime services that are
- available (such as the EFI variable services).
-
- This option is only useful on systems that have EFI firmware
- and will result in a kernel image that is ~8k larger. In addition,
- you must use the latest ELILO loader available at
- <http://elilo.sourceforge.net> in order to take advantage of
- kernel initialization using EFI information (neither GRUB nor LILO know
- anything about EFI). However, even with this option, the resultant
- kernel should continue to boot on existing non-EFI platforms.
+ bool
+ depends on XEN_PRIVILEGED_GUEST
+ default y
+
+#config MTRR
+# bool "MTRR (Memory Type Range Register) support"
+# ---help---
+# On Intel P6 family processors (Pentium Pro, Pentium II and later)
+# the Memory Type Range Registers (MTRRs) may be used to control
+# processor access to memory ranges. This is most useful if you have
+# a video (VGA) card on a PCI or AGP bus. Enabling write-combining
+# allows bus write transfers to be combined into a larger transfer
+# before bursting over the PCI/AGP bus. This can increase performance
+# of image write operations 2.5 times or more. Saying Y here creates a
+# /proc/mtrr file which may be used to manipulate your processor's
+# MTRRs. Typically the X server should use this.
+#
+# This code has a reasonably generic interface so that similar
+# control registers on other processors can be easily supported
+# as well:
+#
+# The Cyrix 6x86, 6x86MX and M II processors have Address Range
+# Registers (ARRs) which provide a similar functionality to MTRRs. For
+# these, the ARRs are used to emulate the MTRRs.
+# The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
+# MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing
+# write-combining. All of these processors are supported by this code
+# and it makes sense to say Y here if you have one of them.
+#
+# Saying Y here also fixes a problem with buggy SMP BIOSes which only
+# set the MTRRs for the boot CPU and not for the secondary CPUs. This
+# can lead to all sorts of problems, so it's good to say Y here.
+#
+# You can safely say Y even if your machine doesn't have MTRRs, you'll
+# just add about 9 KB to your kernel.
+#
+# See <file:Documentation/mtrr.txt> for more information.
config IRQBALANCE
bool "Enable kernel irq balancing"
- depends on SMP && X86_IO_APIC
+ depends on SMP && X86_IO_APIC && !XEN
default y
help
The default yes will allow the kernel to do irq load balancing.
@@ -922,186 +714,59 @@ config REGPARM
generate incorrect output with certain kernel constructs when
-mregparm=3 is used.
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
+config X86_LOCAL_APIC
+ bool
+ depends on XEN_PRIVILEGED_GUEST && (X86_UP_APIC || ((X86_VISWS || SMP)
&& !X86_VOYAGER))
default y
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y. Only embedded should say N here.
-
-endmenu
-
-
-menu "Power management options (ACPI, APM)"
- depends on !X86_VOYAGER
-
-source kernel/power/Kconfig
-source "drivers/acpi/Kconfig"
+config X86_IO_APIC
+ bool
+ depends on XEN_PRIVILEGED_GUEST && (X86_UP_IOAPIC || (SMP &&
!(X86_VISWS || X86_VOYAGER)))
+ default y
-menu "APM (Advanced Power Management) BIOS Support"
-depends on PM && !X86_VISWS
+config X86_VISWS_APIC
+ bool
+ depends on X86_VISWS
+ default y
-config APM
- tristate "APM (Advanced Power Management) BIOS support"
- depends on PM
+config HOTPLUG_CPU
+ bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
+ depends on SMP && HOTPLUG && EXPERIMENTAL
---help---
- APM is a BIOS specification for saving power using several different
- techniques. This is mostly useful for battery powered laptops with
- APM compliant BIOSes. If you say Y here, the system time will be
- reset after a RESUME operation, the /proc/apm device will provide
- battery status information, and user-space programs will receive
- notification of APM "events" (e.g. battery status change).
-
- If you select "Y" here, you can disable actual use of the APM
- BIOS by passing the "apm=off" option to the kernel at boot time.
-
- Note that the APM support is almost completely disabled for
- machines with more than one CPU.
-
- In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/pm.txt> and the
- Battery Powered Linux mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>.
+ Say Y here to experiment with turning CPUs off and on. CPUs
+ can be controlled through /sys/devices/system/cpu.
- This driver does not spin down disk drives (see the hdparm(8)
- manpage ("man 8 hdparm") for that), and it doesn't turn off
- VESA-compliant "green" monitors.
-
- This driver does not support the TI 4000M TravelMate and the ACER
- 486/DX4/75 because they don't have compliant BIOSes. Many "green"
- desktop machines also don't have compliant BIOSes, and this driver
- may cause those machines to panic during the boot phase.
-
- Generally, if you don't have a battery in your machine, there isn't
- much point in using this driver and you should say N. If you get
- random kernel OOPSes or reboots that don't seem to be related to
- anything, try disabling/enabling this option (or disabling/enabling
- APM in your BIOS).
-
- Some other things you should try when experiencing seemingly random,
- "weird" problems:
-
- 1) make sure that you have enough swap space and that it is
- enabled.
- 2) pass the "no-hlt" option to the kernel
- 3) switch on floating point emulation in the kernel and pass
- the "no387" option to the kernel
- 4) pass the "floppy=nodma" option to the kernel
- 5) pass the "mem=4M" option to the kernel (thereby disabling
- all but the first 4 MB of RAM)
- 6) make sure that the CPU is not over clocked.
- 7) read the sig11 FAQ at <http://www.bitwizard.nl/sig11/>
- 8) disable the cache from your BIOS settings
- 9) install a fan for the video card or exchange video RAM
- 10) install a better fan for the CPU
- 11) exchange RAM chips
- 12) exchange the motherboard.
+ Say N.
- To compile this driver as a module, choose M here: the
- module will be called apm.
-config APM_IGNORE_USER_SUSPEND
- bool "Ignore USER SUSPEND"
- depends on APM
- help
- This option will ignore USER SUSPEND requests. On machines with a
- compliant APM BIOS, you want to say N. However, on the NEC Versa M
- series notebooks, it is necessary to say Y because of a BIOS bug.
-
-config APM_DO_ENABLE
- bool "Enable PM at boot time"
- depends on APM
- ---help---
- Enable APM features at boot time. From page 36 of the APM BIOS
- specification: "When disabled, the APM BIOS does not automatically
- power manage devices, enter the Standby State, enter the Suspend
- State, or take power saving steps in response to CPU Idle calls."
- This driver will make CPU Idle calls when Linux is idle (unless this
- feature is turned off -- see "Do CPU IDLE calls", below). This
- should always save battery power, but more complicated APM features
- will be dependent on your BIOS implementation. You may need to turn
- this option off if your computer hangs at boot time when using APM
- support, or if it beeps continuously instead of suspending. Turn
- this off if you have a NEC UltraLite Versa 33/C or a Toshiba
- T400CDT. This is off by default since most machines do fine without
- this feature.
-
-config APM_CPU_IDLE
- bool "Make CPU Idle calls when idle"
- depends on APM
- help
- Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
- On some machines, this can activate improved power savings, such as
- a slowed CPU clock rate, when the machine is idle. These idle calls
- are made after the idle loop has run for some length of time (e.g.,
- 333 mS). On some machines, this will cause a hang at boot time or
- whenever the CPU becomes idle. (On machines with more than one CPU,
- this option does nothing.)
-
-config APM_DISPLAY_BLANK
- bool "Enable console blanking using APM"
- depends on APM
- help
- Enable console blanking using the APM. Some laptops can use this to
- turn off the LCD backlight when the screen blanker of the Linux
- virtual console blanks the screen. Note that this is only used by
- the virtual console screen blanker, and won't turn off the backlight
- when using the X Window system. This also doesn't have anything to
- do with your VESA-compliant power-saving monitor. Further, this
- option doesn't work for all laptops -- it might not turn off your
- backlight at all, or it might print a lot of errors to the console,
- especially if you are using gpm.
-
-config APM_RTC_IS_GMT
- bool "RTC stores time in GMT"
- depends on APM
- help
- Say Y here if your RTC (Real Time Clock a.k.a. hardware clock)
- stores the time in GMT (Greenwich Mean Time). Say N if your RTC
- stores localtime.
-
- It is in fact recommended to store GMT in your RTC, because then you
- don't have to worry about daylight savings time changes. The only
- reason not to use GMT in your RTC is if you also run a broken OS
- that doesn't understand GMT.
-
-config APM_ALLOW_INTS
- bool "Allow interrupts during APM BIOS calls"
- depends on APM
- help
- Normally we disable external interrupts while we are making calls to
- the APM BIOS as a measure to lessen the effects of a badly behaving
- BIOS implementation. The BIOS should reenable interrupts if it
- needs to. Unfortunately, some BIOSes do not -- especially those in
- many of the newer IBM Thinkpads. If you experience hangs when you
- suspend, try setting this to Y. Otherwise, say N.
-
-config APM_REAL_MODE_POWER_OFF
- bool "Use real mode APM BIOS call to power off"
- depends on APM
- help
- Use real mode APM BIOS calls to switch off the computer. This is
- a work-around for a number of buggy BIOSes. Switch this option on if
- your computer crashes instead of powering off properly.
+if XEN_PHYSDEV_ACCESS
-endmenu
+menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)"
-source "arch/i386/kernel/cpu/cpufreq/Kconfig"
+config X86_UP_APIC
+ bool "Local APIC support on uniprocessors"
+ depends on !SMP && !(X86_VISWS || X86_VOYAGER)
+ help
+ A local APIC (Advanced Programmable Interrupt Controller) is an
+ integrated interrupt controller in the CPU. If you have a single-CPU
+ system which has a processor with a local APIC, you can say Y here to
+ enable and use it. If you say Y here even though your machine doesn't
+ have a local APIC, then the kernel will still run with no slowdown at
+ all. The local APIC supports CPU-generated self-interrupts (timer,
+ performance counters), and the NMI watchdog which detects hard
+ lockups.
-endmenu
+config X86_UP_IOAPIC
+ bool "IO-APIC support on uniprocessors"
+ depends on X86_UP_APIC
+ help
+ An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
+ SMP-capable replacement for PC-style interrupt controllers. Most
+ SMP systems and many recent uniprocessor systems have one.
-menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)"
+ If you have a single-CPU system with an IO-APIC, you can say Y here
+ to use it. If you say Y here even though your machine doesn't have
+ an IO-APIC, then the kernel will still run with no slowdown at all.
config PCI
bool "PCI support" if !X86_VISWS
@@ -1232,25 +897,7 @@ source "drivers/pci/hotplug/Kconfig"
endmenu
-menu "Executable file formats"
-
-source "fs/Kconfig.binfmt"
-
-endmenu
-
-source "drivers/Kconfig"
-
-source "fs/Kconfig"
-
-source "arch/i386/oprofile/Kconfig"
-
-source "arch/i386/Kconfig.debug"
-
-source "security/Kconfig"
-
-source "crypto/Kconfig"
-
-source "lib/Kconfig"
+endif
#
# Use the generic interrupt handling code in kernel/irq/:
@@ -1268,10 +915,10 @@ config X86_SMP
depends on SMP && !X86_VOYAGER
default y
-config X86_HT
- bool
- depends on SMP && !(X86_VISWS || X86_VOYAGER)
- default y
+#config X86_HT
+# bool
+# depends on SMP && !(X86_VISWS || X86_VOYAGER)
+# default y
config X86_BIOS_REBOOT
bool
@@ -1287,3 +934,22 @@ config PC
bool
depends on X86 && !EMBEDDED
default y
+
+config SECCOMP
+ bool "Enable seccomp to safely compute untrusted bytecode"
+ depends on PROC_FS
+ default y
+ help
+ This kernel feature is useful for number crunching applications
+ that may need to compute untrusted bytecode during their
+ execution. By using pipes or other transports made available to
+ the process as file descriptors supporting the read/write
+ syscalls, it's possible to isolate those applications in
+ their own address space using seccomp. Once seccomp is
+ enabled via /proc/<pid>/seccomp, it cannot be disabled
+ and the task is only allowed to execute a few safe syscalls
+ defined by each seccomp mode.
+
+ If unsure, say Y. Only embedded should say N here.
+
+endmenu
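
One Kconfig addition worth flagging is SMP_ALTERNATIVES, which patches
SMP-only instruction sequences down to cheaper UP equivalents when only
one CPU is online. Roughly, each patch site records an address plus the
two byte sequences, and every site is rewritten in place when the online
CPU count changes. A minimal sketch of that idea in C (the names here
are illustrative only, not the symbols the sparse tree actually uses):

	#include <string.h>

	/* One patchable site: where it lives and what to write there. */
	struct smp_alt_site {
		unsigned char *addr;	/* start of the patchable sequence */
		unsigned char smp[8];	/* bytes used when >1 CPU is online */
		unsigned char up[8];	/* cheaper uniprocessor bytes */
		unsigned int len;	/* sequence length, <= 8 here */
	};

	/* Rewrite every site for the current mode (smp != 0 means more
	 * than one CPU is online). */
	static void apply_smp_alternatives(struct smp_alt_site *site,
					   unsigned int count, int smp)
	{
		unsigned int i;

		for (i = 0; i < count; i++)
			memcpy(site[i].addr,
			       smp ? site[i].smp : site[i].up,
			       site[i].len);
	}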
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/acpi/boot.c linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot.c
--- pristine-linux-2.6.12/arch/i386/kernel/acpi/boot.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot.c 2005-07-28 13:17:07.000000000 -0700
@@ -36,6 +36,11 @@
#include <asm/io.h>
#include <asm/irq.h>
#include <asm/mpspec.h>
+#ifdef CONFIG_XEN
+#include <asm/fixmap.h>
+#endif
+
+void (*pm_power_off)(void) = NULL;
#ifdef CONFIG_X86_64
@@ -100,7 +105,7 @@ EXPORT_SYMBOL(x86_acpiid_to_apicid);
*/
enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
/* rely on all ACPI tables being in the direct mapping */
char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
@@ -133,8 +138,10 @@ char *__acpi_map_table(unsigned long phy
unsigned long base, offset, mapped_size;
int idx;
+#ifndef CONFIG_XEN
if (phys + size < 8*1024*1024)
return __va(phys);
+#endif
offset = phys & (PAGE_SIZE - 1);
mapped_size = PAGE_SIZE - offset;
@@ -462,18 +469,6 @@ unsigned int acpi_register_gsi(u32 gsi,
unsigned int irq;
unsigned int plat_gsi = gsi;
-#ifdef CONFIG_PCI
- /*
- * Make sure all (legacy) PCI IRQs are set as level-triggered.
- */
- if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- extern void eisa_set_level_irq(unsigned int irq);
-
- if (edge_level == ACPI_LEVEL_SENSITIVE)
- eisa_set_level_irq(gsi);
- }
-#endif
-
#ifdef CONFIG_X86_IO_APIC
if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
plat_gsi = mp_register_gsi(gsi, edge_level, active_high_low);
@@ -513,13 +508,14 @@ acpi_scan_rsdp (
{
unsigned long offset = 0;
unsigned long sig_len = sizeof("RSD PTR ") - 1;
+ unsigned long vstart = (unsigned long)isa_bus_to_virt(start);
/*
* Scan all 16-byte boundaries of the physical memory region for the
* RSDP signature.
*/
for (offset = 0; offset < length; offset += 16) {
- if (strncmp((char *) (start + offset), "RSD PTR ", sig_len))
+ if (strncmp((char *) (vstart + offset), "RSD PTR ", sig_len))
continue;
return (start + offset);
}
@@ -652,6 +648,8 @@ acpi_find_rsdp (void)
if (!rsdp_phys)
rsdp_phys = acpi_scan_rsdp (0xE0000, 0x20000);
+ set_fixmap(FIX_ACPI_RSDP_PAGE, rsdp_phys);
+
return rsdp_phys;
}
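
The acpi_scan_rsdp() hunk above is the subtle part of this file: the
signature scan now compares through isa_bus_to_virt() instead of
dereferencing the physical address directly (presumably because a Xen
guest cannot assume low physical memory is mapped 1:1), while the
function still returns the physical address of the hit. Condensed from
the hunk, the fixed loop amounts to:

	/* Scan a physical range for the ACPI RSDP signature.  Compare
	 * through a virtual mapping of the ISA region, but return the
	 * physical address.  The RSDP always sits on a 16-byte
	 * boundary. */
	static unsigned long scan_rsdp(unsigned long start,
				       unsigned long length)
	{
		unsigned long sig_len = sizeof("RSD PTR ") - 1;
		unsigned long vstart =
			(unsigned long)isa_bus_to_virt(start);
		unsigned long offset;

		for (offset = 0; offset < length; offset += 16)
			if (!strncmp((char *)(vstart + offset),
				     "RSD PTR ", sig_len))
				return start + offset;
		return 0;
	}

acpi_find_rsdp() then publishes the result through
set_fixmap(FIX_ACPI_RSDP_PAGE, rsdp_phys) so it stays reachable later.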
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/acpi/Makefile linux-2.6-xen-sparse/arch/i386/kernel/acpi/Makefile
--- pristine-linux-2.6.12/arch/i386/kernel/acpi/Makefile 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/acpi/Makefile 2005-07-28 13:17:07.000000000 -0700
@@ -1,4 +1,13 @@
-obj-$(CONFIG_ACPI_BOOT) := boot.o
-obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o
-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
+obj-$(CONFIG_ACPI_BOOT) := boot.o
+c-obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o
+c-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+ @ln -fsn $(srctree)/arch/i386/kernel/acpi/$(notdir $@) $@
+
+obj-y += $(c-obj-y) $(s-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
+clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link))
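
The Makefile change above is the pattern repeated across the sparse
tree: sources that build unmodified move from obj-y to c-obj-y and get
symlinked back from the pristine tree at build time, so the sparse tree
only carries files that actually changed. The apic.c diff that follows
is the opposite extreme: with Xen owning the local APIC, roughly 1200
lines of native setup, suspend/resume, and timer-calibration code go
away, leaving little beyond a stub like this (lifted from the first
hunk below):

	/* With no real local APIC to probe under Xen, the broadcast ID
	 * is pinned to the integrated-APIC value; the deleted version
	 * read the APIC version register to choose between 0xff and
	 * the 82489DX's 0xf. */
	int get_physical_broadcast(void)
	{
		return 0xff;
	}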
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/apic.c linux-2.6-xen-sparse/arch/i386/kernel/apic.c
--- pristine-linux-2.6.12/arch/i386/kernel/apic.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/apic.c 2005-07-28 13:17:07.000000000 -0700
@@ -44,8 +44,10 @@
*/
int apic_verbosity;
-
-static void apic_pm_activate(void);
+int get_physical_broadcast(void)
+{
+ return 0xff;
+}
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
@@ -65,1212 +67,17 @@ void ack_bad_irq(unsigned int irq)
ack_APIC_irq();
}
-void __init apic_intr_init(void)
-{
-#ifdef CONFIG_SMP
- smp_intr_init();
-#endif
- /* self generated IPI for local APIC timer */
- set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* IPI vectors for APIC spurious and error interrupts */
- set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- /* thermal monitor LVT interrupt */
-#ifdef CONFIG_X86_MCE_P4THERMAL
- set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-}
-
-/* Using APIC to generate smp_local_timer_interrupt? */
-int using_apic_timer = 0;
-
-static DEFINE_PER_CPU(int, prof_multiplier) = 1;
-static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
-static DEFINE_PER_CPU(int, prof_counter) = 1;
-
-static int enabled_via_apicbase;
-
-void enable_NMI_through_LVT0 (void * dummy)
-{
- unsigned int v, ver;
-
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
- v = APIC_DM_NMI; /* unmask and set to NMI */
- if (!APIC_INTEGRATED(ver)) /* 82489DX */
- v |= APIC_LVT_LEVEL_TRIGGER;
- apic_write_around(APIC_LVT0, v);
-}
-
-int get_physical_broadcast(void)
-{
- unsigned int lvr, version;
- lvr = apic_read(APIC_LVR);
- version = GET_APIC_VERSION(lvr);
- if (!APIC_INTEGRATED(version) || version >= 0x14)
- return 0xff;
- else
- return 0xf;
-}
-
-int get_maxlvt(void)
-{
- unsigned int v, ver, maxlvt;
-
- v = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(v);
- /* 82489DXs do not report # of LVT entries. */
- maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2;
- return maxlvt;
-}
-
-void clear_local_APIC(void)
-{
- int maxlvt;
- unsigned long v;
-
- maxlvt = get_maxlvt();
-
- /*
- * Masking an LVT entry on a P6 can trigger a local APIC error
- * if the vector is zero. Mask LVTERR first to prevent this.
- */
- if (maxlvt >= 3) {
- v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
- apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
- }
- /*
- * Careful: we have to set masks only first to deassert
- * any level-triggered sources.
- */
- v = apic_read(APIC_LVTT);
- apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
- v = apic_read(APIC_LVT0);
- apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
- v = apic_read(APIC_LVT1);
- apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
- if (maxlvt >= 4) {
- v = apic_read(APIC_LVTPC);
- apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
- }
-
-/* lets not touch this if we didn't frob it */
-#ifdef CONFIG_X86_MCE_P4THERMAL
- if (maxlvt >= 5) {
- v = apic_read(APIC_LVTTHMR);
- apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
- }
-#endif
- /*
- * Clean APIC state for other OSs:
- */
- apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
- apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
- if (maxlvt >= 3)
- apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
- if (maxlvt >= 4)
- apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
-
-#ifdef CONFIG_X86_MCE_P4THERMAL
- if (maxlvt >= 5)
- apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
-#endif
- v = GET_APIC_VERSION(apic_read(APIC_LVR));
- if (APIC_INTEGRATED(v)) { /* !82489DX */
- if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP.
*/
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- }
-}
-
-void __init connect_bsp_APIC(void)
-{
- if (pic_mode) {
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
- /*
- * PIC mode, enable APIC mode in the IMCR, i.e.
- * connect BSP's local APIC to INT and NMI lines.
- */
- apic_printk(APIC_VERBOSE, "leaving PIC mode, "
- "enabling APIC mode.\n");
- outb(0x70, 0x22);
- outb(0x01, 0x23);
- }
- enable_apic_mode();
-}
-
-void disconnect_bsp_APIC(void)
-{
- if (pic_mode) {
- /*
- * Put the board back into PIC mode (has an effect
- * only on certain older boards). Note that APIC
- * interrupts, including IPIs, won't work beyond
- * this point! The only exception are INIT IPIs.
- */
- apic_printk(APIC_VERBOSE, "disabling APIC mode, "
- "entering PIC mode.\n");
- outb(0x70, 0x22);
- outb(0x00, 0x23);
- }
-}
-
-void disable_local_APIC(void)
-{
- unsigned long value;
-
- clear_local_APIC();
-
- /*
- * Disable APIC (implies clearing of registers
- * for 82489DX!).
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_SPIV_APIC_ENABLED;
- apic_write_around(APIC_SPIV, value);
-
- if (enabled_via_apicbase) {
- unsigned int l, h;
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_ENABLE;
- wrmsr(MSR_IA32_APICBASE, l, h);
- }
-}
-
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
- unsigned int reg0, reg1;
-
- /*
- * The version register is read-only in a real APIC.
- */
- reg0 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
- apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
- reg1 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
- /*
- * The two version reads above should print the same
- * numbers. If the second one is different, then we
- * poke at a non-APIC.
- */
- if (reg1 != reg0)
- return 0;
-
- /*
- * Check if the version looks reasonably.
- */
- reg1 = GET_APIC_VERSION(reg0);
- if (reg1 == 0x00 || reg1 == 0xff)
- return 0;
- reg1 = get_maxlvt();
- if (reg1 < 0x02 || reg1 == 0xff)
- return 0;
-
- /*
- * The ID register is read/write in a real APIC.
- */
- reg0 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
-
- /*
- * The next two are just to see if we have sane values.
- * They're only really relevant if we're in Virtual Wire
- * compatibility mode, but most boxes are anymore.
- */
- reg0 = apic_read(APIC_LVT0);
- apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
- reg1 = apic_read(APIC_LVT1);
- apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
- return 1;
-}
-
-void __init sync_Arb_IDs(void)
-{
- /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
- unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
- if (ver >= 0x14) /* P4 or higher */
- return;
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
- | APIC_DM_INIT);
-}
-
-extern void __error_in_apic_c (void);
-
-/*
- * An initial setup of the virtual wire mode.
- */
-void __init init_bsp_APIC(void)
-{
- unsigned long value, ver;
-
- /*
- * Don't do the setup now if we have a SMP BIOS as the
- * through-I/O-APIC virtual wire mode might be active.
- */
- if (smp_found_config || !cpu_has_apic)
- return;
-
- value = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(value);
-
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
-
- /*
- * Enable APIC.
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
-
- /* This bit is reserved on P4/Xeon and should be cleared */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15))
- value &= ~APIC_SPIV_FOCUS_DISABLED;
- else
- value |= APIC_SPIV_FOCUS_DISABLED;
- value |= SPURIOUS_APIC_VECTOR;
- apic_write_around(APIC_SPIV, value);
-
- /*
- * Set up the virtual wire mode.
- */
- apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
- value = APIC_DM_NMI;
- if (!APIC_INTEGRATED(ver)) /* 82489DX */
- value |= APIC_LVT_LEVEL_TRIGGER;
- apic_write_around(APIC_LVT1, value);
-}
-
-void __init setup_local_APIC (void)
-{
- unsigned long oldvalue, value, ver, maxlvt;
-
- /* Pound the ESR really hard over the head with a big hammer - mbligh */
- if (esr_disable) {
- apic_write(APIC_ESR, 0);
- apic_write(APIC_ESR, 0);
- apic_write(APIC_ESR, 0);
- apic_write(APIC_ESR, 0);
- }
-
- value = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(value);
-
- if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
- __error_in_apic_c();
-
- /*
- * Double-check whether this APIC is really registered.
- */
- if (!apic_id_registered())
- BUG();
-
- /*
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
- init_apic_ldr();
-
- /*
- * Set Task Priority to 'accept all'. We never change this
- * later on.
- */
- value = apic_read(APIC_TASKPRI);
- value &= ~APIC_TPRI_MASK;
- apic_write_around(APIC_TASKPRI, value);
-
- /*
- * Now that we are all set up, enable the APIC
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- /*
- * Enable APIC
- */
- value |= APIC_SPIV_APIC_ENABLED;
-
- /*
- * Some unknown Intel IO/APIC (or APIC) errata is biting us with
- * certain networking cards. If high frequency interrupts are
- * happening on a particular IOAPIC pin, plus the IOAPIC routing
- * entry is masked/unmasked at a high rate as well then sooner or
- * later IOAPIC line gets 'stuck', no more interrupts are received
- * from the device. If focus CPU is disabled then the hang goes
- * away, oh well :-(
- *
- * [ This bug can be reproduced easily with a level-triggered
- * PCI Ne2000 networking cards and PII/PIII processors, dual
- * BX chipset. ]
- */
- /*
- * Actually disabling the focus CPU check just makes the hang less
- * frequent as it makes the interrupt distributon model be more
- * like LRU than MRU (the short-term load is more even across CPUs).
- * See also the comment in end_level_ioapic_irq(). --macro
- */
-#if 1
- /* Enable focus processor (bit==0) */
- value &= ~APIC_SPIV_FOCUS_DISABLED;
-#else
- /* Disable focus processor (bit==1) */
- value |= APIC_SPIV_FOCUS_DISABLED;
-#endif
- /*
- * Set spurious IRQ vector
- */
- value |= SPURIOUS_APIC_VECTOR;
- apic_write_around(APIC_SPIV, value);
-
- /*
- * Set up LVT0, LVT1:
- *
- * set up through-local-APIC on the BP's LINT0. This is not
- * strictly necessery in pure symmetric-IO mode, but sometimes
- * we delegate interrupts to the 8259A.
- */
- /*
- * TODO: set up through-local-APIC from through-I/O-APIC? --macro
- */
- value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && (pic_mode || !value)) {
- value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
- smp_processor_id());
- } else {
- value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
- smp_processor_id());
- }
- apic_write_around(APIC_LVT0, value);
-
- /*
- * only the BP should see the LINT1 NMI signal, obviously.
- */
- if (!smp_processor_id())
- value = APIC_DM_NMI;
- else
- value = APIC_DM_NMI | APIC_LVT_MASKED;
- if (!APIC_INTEGRATED(ver)) /* 82489DX */
- value |= APIC_LVT_LEVEL_TRIGGER;
- apic_write_around(APIC_LVT1, value);
-
- if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */
- maxlvt = get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- oldvalue = apic_read(APIC_ESR);
-
- value = ERROR_APIC_VECTOR; // enables sending errors
- apic_write_around(APIC_LVTERR, value);
- /*
- * spec says clear errors after enabling vector.
- */
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
- value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08lx after: 0x%08lx\n",
- oldvalue, value);
- } else {
- if (esr_disable)
- /*
- * Something untraceble is creating bad interrupts on
- * secondary quads ... for the moment, just leave the
- * ESR disabled - we can't do anything useful with the
- * errors anyway - mbligh
- */
- printk("Leaving ESR disabled.\n");
- else
- printk("No ESR for 82489DX.\n");
- }
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- setup_apic_nmi_watchdog();
- apic_pm_activate();
-}
-
-/*
- * If Linux enabled the LAPIC against the BIOS default
- * disable it down before re-entering the BIOS on shutdown.
- * Otherwise the BIOS may get confused and not power-off.
- */
-void lapic_shutdown(void)
-{
- if (!cpu_has_apic || !enabled_via_apicbase)
- return;
-
- local_irq_disable();
- disable_local_APIC();
- local_irq_enable();
-}
-
-#ifdef CONFIG_PM
-
-static struct {
- int active;
- /* r/w apic fields */
- unsigned int apic_id;
- unsigned int apic_taskpri;
- unsigned int apic_ldr;
- unsigned int apic_dfr;
- unsigned int apic_spiv;
- unsigned int apic_lvtt;
- unsigned int apic_lvtpc;
- unsigned int apic_lvt0;
- unsigned int apic_lvt1;
- unsigned int apic_lvterr;
- unsigned int apic_tmict;
- unsigned int apic_tdcr;
- unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- unsigned long flags;
-
- if (!apic_pm_state.active)
- return 0;
-
- apic_pm_state.apic_id = apic_read(APIC_ID);
- apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
- apic_pm_state.apic_ldr = apic_read(APIC_LDR);
- apic_pm_state.apic_dfr = apic_read(APIC_DFR);
- apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
- apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
- apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
- apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
- apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
- apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
- apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
- apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
- apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-
- local_irq_save(flags);
- disable_local_APIC();
- local_irq_restore(flags);
- return 0;
-}
-
-static int lapic_resume(struct sys_device *dev)
-{
- unsigned int l, h;
- unsigned long flags;
-
- if (!apic_pm_state.active)
- return 0;
-
- local_irq_save(flags);
-
- /*
- * Make sure the APICBASE points to the right address
- *
- * FIXME! This will be wrong if we ever support suspend on
- * SMP! We'll need to do this as part of the CPU restore!
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
-
- apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
- apic_write(APIC_ID, apic_pm_state.apic_id);
- apic_write(APIC_DFR, apic_pm_state.apic_dfr);
- apic_write(APIC_LDR, apic_pm_state.apic_ldr);
- apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
- apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
- apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
- apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
- apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
- apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
- apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
- apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
- apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- local_irq_restore(flags);
- return 0;
-}
-
-/*
- * This device has no shutdown method - fully functioning local APICs
- * are needed on every CPU up until machine_halt/restart/poweroff.
- */
-
-static struct sysdev_class lapic_sysclass = {
- set_kset_name("lapic"),
- .resume = lapic_resume,
- .suspend = lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
- .id = 0,
- .cls = &lapic_sysclass,
-};
-
-static void __init apic_pm_activate(void)
-{
- apic_pm_state.active = 1;
-}
-
-static int __init init_lapic_sysfs(void)
-{
- int error;
-
- if (!cpu_has_apic)
- return 0;
- /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-
- error = sysdev_class_register(&lapic_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic);
- return error;
-}
-device_initcall(init_lapic_sysfs);
-
-#else /* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif /* CONFIG_PM */
-
-/*
- * Detect and enable local APICs on non-SMP boards.
- * Original code written by Keir Fraser.
- */
-
-/*
- * Knob to control our willingness to enable the local APIC.
- */
-int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
-
-static int __init lapic_disable(char *str)
-{
- enable_local_apic = -1;
- clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
- return 0;
-}
-__setup("nolapic", lapic_disable);
-
-static int __init lapic_enable(char *str)
-{
- enable_local_apic = 1;
- return 0;
-}
-__setup("lapic", lapic_enable);
-
-static int __init apic_set_verbosity(char *str)
-{
- if (strcmp("debug", str) == 0)
- apic_verbosity = APIC_DEBUG;
- else if (strcmp("verbose", str) == 0)
- apic_verbosity = APIC_VERBOSE;
- else
- printk(KERN_WARNING "APIC Verbosity level %s not recognised"
- " use apic=verbose or apic=debug", str);
-
- return 0;
-}
-
-__setup("apic=", apic_set_verbosity);
-
-static int __init detect_init_APIC (void)
-{
- u32 h, l, features;
- extern void get_cpu_vendor(struct cpuinfo_x86*);
-
- /* Disabled by kernel option? */
- if (enable_local_apic < 0)
- return -1;
-
- /* Workaround for us being called before identify_cpu(). */
- get_cpu_vendor(&boot_cpu_data);
-
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
- (boot_cpu_data.x86 == 15))
- break;
- goto no_apic;
- case X86_VENDOR_INTEL:
- if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
- (boot_cpu_data.x86 == 5 && cpu_has_apic))
- break;
- goto no_apic;
- default:
- goto no_apic;
- }
-
- if (!cpu_has_apic) {
- /*
- * Over-ride BIOS and try to enable the local
- * APIC only if "lapic" specified.
- */
- if (enable_local_apic <= 0) {
- printk("Local APIC disabled by BIOS -- "
- "you can enable it with \"lapic\"\n");
- return -1;
- }
- /*
- * Some BIOSes disable the local APIC in the
- * APIC_BASE MSR. This can only be done in
- * software for Intel P6 or later and AMD K7
- * (Model > 1) or later.
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- printk("Local APIC disabled by BIOS -- reenabling.\n");
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
- wrmsr(MSR_IA32_APICBASE, l, h);
- enabled_via_apicbase = 1;
- }
- }
- /*
- * The APIC feature bit should now be enabled
- * in `cpuid'
- */
- features = cpuid_edx(1);
- if (!(features & (1 << X86_FEATURE_APIC))) {
- printk("Could not enable APIC!\n");
- return -1;
- }
- set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /* The BIOS may have set up the APIC at some other address */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
- if (nmi_watchdog != NMI_NONE)
- nmi_watchdog = NMI_LOCAL_APIC;
-
- printk("Found and enabled local APIC!\n");
-
- apic_pm_activate();
-
- return 0;
-
-no_apic:
- printk("No local APIC present or hardware disabled\n");
- return -1;
-}
-
-void __init init_apic_mappings(void)
-{
- unsigned long apic_phys;
-
- /*
- * If no local APIC can be found then set up a fake all
- * zeroes page to simulate the local APIC and another
- * one for the IO-APIC.
- */
- if (!smp_found_config && detect_init_APIC()) {
- apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
- apic_phys = __pa(apic_phys);
- } else
- apic_phys = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
- apic_phys);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-
-#ifdef CONFIG_X86_IO_APIC
- {
- unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
- int i;
-
- for (i = 0; i < nr_ioapics; i++) {
- if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mpc_apicaddr;
- if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
- "disabling IO/APIC support!\n");
- smp_found_config = 0;
- skip_ioapic_setup = 1;
- goto fake_ioapic_page;
- }
- } else {
-fake_ioapic_page:
- ioapic_phys = (unsigned long)
- alloc_bootmem_pages(PAGE_SIZE);
- ioapic_phys = __pa(ioapic_phys);
- }
- set_fixmap_nocache(idx, ioapic_phys);
- printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx), ioapic_phys);
- idx++;
- }
- }
-#endif
-}
-
-/*
- * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts
- * per second. We assume that the caller has already set up the local
- * APIC.
- *
- * The APIC timer is not exactly sync with the external timer chip, it
- * closely follows bus clocks.
- */
-
-/*
- * The timer chip is already set up at HZ interrupts per second here,
- * but we do not accept timer interrupts yet. We only allow the BP
- * to calibrate.
- */
-static unsigned int __init get_8254_timer_count(void)
-{
- extern spinlock_t i8253_lock;
- unsigned long flags;
-
- unsigned int count;
-
- spin_lock_irqsave(&i8253_lock, flags);
-
- outb_p(0x00, PIT_MODE);
- count = inb_p(PIT_CH0);
- count |= inb_p(PIT_CH0) << 8;
-
- spin_unlock_irqrestore(&i8253_lock, flags);
-
- return count;
-}
-
-/* next tick in 8254 can be caught by catching timer wraparound */
-static void __init wait_8254_wraparound(void)
-{
- unsigned int curr_count, prev_count;
-
- curr_count = get_8254_timer_count();
- do {
- prev_count = curr_count;
- curr_count = get_8254_timer_count();
-
- /* workaround for broken Mercury/Neptune */
- if (prev_count >= curr_count + 0x100)
- curr_count = get_8254_timer_count();
-
- } while (prev_count >= curr_count);
-}
-
-/*
- * Default initialization for 8254 timers. If we use other timers like HPET,
- * we override this later
- */
-void (*wait_timer_tick)(void) __initdata = wait_8254_wraparound;
-
-/*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
- */
-
-#define APIC_DIVISOR 16
-
-static void __setup_APIC_LVTT(unsigned int clocks)
-{
- unsigned int lvtt_value, tmp_value, ver;
-
- ver = GET_APIC_VERSION(apic_read(APIC_LVR));
- lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
- if (!APIC_INTEGRATED(ver))
- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
- apic_write_around(APIC_LVTT, lvtt_value);
-
- /*
- * Divide PICLK by 16
- */
- tmp_value = apic_read(APIC_TDCR);
- apic_write_around(APIC_TDCR, (tmp_value
- & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
- | APIC_TDR_DIV_16);
-
- apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
-}
-
-static void __init setup_APIC_timer(unsigned int clocks)
-{
- unsigned long flags;
-
- local_irq_save(flags);
-
- /*
- * Wait for IRQ0's slice:
- */
- wait_timer_tick();
-
- __setup_APIC_LVTT(clocks);
-
- local_irq_restore(flags);
-}
-
-/*
- * In this function we calibrate APIC bus clocks to the external
- * timer. Unfortunately we cannot use jiffies and the timer irq
- * to calibrate, since some later bootup code depends on getting
- * the first irq? Ugh.
- *
- * We want to do the calibration only once since we
- * want to have local timer irqs syncron. CPUs connected
- * by the same APIC bus have the very same bus frequency.
- * And we want to have irqs off anyways, no accidental
- * APIC irq that way.
- */
-
-static int __init calibrate_APIC_clock(void)
-{
- unsigned long long t1 = 0, t2 = 0;
- long tt1, tt2;
- long result;
- int i;
- const int LOOPS = HZ/10;
-
- apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n");
-
- /*
- * Put whatever arbitrary (but long enough) timeout
- * value into the APIC clock, we just want to get the
- * counter running for calibration.
- */
- __setup_APIC_LVTT(1000000000);
-
- /*
- * The timer chip counts down to zero. Let's wait
- * for a wraparound to start exact measurement:
- * (the current tick might have been already half done)
- */
-
- wait_timer_tick();
-
- /*
- * We wrapped around just now. Let's start:
- */
- if (cpu_has_tsc)
- rdtscll(t1);
- tt1 = apic_read(APIC_TMCCT);
-
- /*
- * Let's wait LOOPS wraprounds:
- */
- for (i = 0; i < LOOPS; i++)
- wait_timer_tick();
-
- tt2 = apic_read(APIC_TMCCT);
- if (cpu_has_tsc)
- rdtscll(t2);
-
- /*
- * The APIC bus clock counter is 32 bits only, it
- * might have overflown, but note that we use signed
- * longs, thus no extra care needed.
- *
- * underflown to be exact, as the timer counts down ;)
- */
-
- result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
-
- if (cpu_has_tsc)
- apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
- "%ld.%04ld MHz.\n",
- ((long)(t2-t1)/LOOPS)/(1000000/HZ),
- ((long)(t2-t1)/LOOPS)%(1000000/HZ));
-
- apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
- "%ld.%04ld MHz.\n",
- result/(1000000/HZ),
- result%(1000000/HZ));
-
- return result;
-}
-
-static unsigned int calibration_result;
-
-void __init setup_boot_APIC_clock(void)
-{
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
- using_apic_timer = 1;
-
- local_irq_disable();
-
- calibration_result = calibrate_APIC_clock();
- /*
- * Now set up the timer for real.
- */
- setup_APIC_timer(calibration_result);
-
- local_irq_enable();
-}
-
-void __init setup_secondary_APIC_clock(void)
-{
- setup_APIC_timer(calibration_result);
-}
-
-void __init disable_APIC_timer(void)
-{
- if (using_apic_timer) {
- unsigned long v;
-
- v = apic_read(APIC_LVTT);
- apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
- }
-}
-
-void enable_APIC_timer(void)
-{
- if (using_apic_timer) {
- unsigned long v;
-
- v = apic_read(APIC_LVTT);
- apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
- }
-}
-
-/*
- * the frequency of the profiling timer can be changed
- * by writing a multiplier value into /proc/profile.
- */
-int setup_profiling_timer(unsigned int multiplier)
-{
- int i;
-
- /*
- * Sanity check. [at least 500 APIC cycles should be
- * between APIC interrupts as a rule of thumb, to avoid
- * irqs flooding us]
- */
- if ( (!multiplier) || (calibration_result/multiplier < 500))
- return -EINVAL;
-
- /*
- * Set the new multiplier for each CPU. CPUs don't start using the
- * new values until the next timer interrupt in which they do process
- * accounting. At that time they also adjust their APIC timers
- * accordingly.
- */
- for (i = 0; i < NR_CPUS; ++i)
- per_cpu(prof_multiplier, i) = multiplier;
-
- return 0;
-}
-
-#undef APIC_DIVISOR
-
-/*
- * Local timer interrupt handler. It does both profiling and
- * process statistics/rescheduling.
- *
- * We do profiling in every local tick, statistics/rescheduling
- * happen only every 'profiling multiplier' ticks. The default
- * multiplier is 1 and it can be changed by writing the new multiplier
- * value into /proc/profile.
- */
-
-inline void smp_local_timer_interrupt(struct pt_regs * regs)
-{
- int cpu = smp_processor_id();
-
- profile_tick(CPU_PROFILING, regs);
- if (--per_cpu(prof_counter, cpu) <= 0) {
- /*
- * The multiplier may have changed since the last time we got
- * to this point as a result of the user writing to
- * /proc/profile. In this case we need to adjust the APIC
- * timer accordingly.
- *
- * Interrupts are already masked off at this point.
- */
- per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
- if (per_cpu(prof_counter, cpu) !=
- per_cpu(prof_old_multiplier, cpu)) {
- __setup_APIC_LVTT(
- calibration_result/
- per_cpu(prof_counter, cpu));
- per_cpu(prof_old_multiplier, cpu) =
- per_cpu(prof_counter, cpu);
- }
-
-#ifdef CONFIG_SMP
- update_process_times(user_mode(regs));
-#endif
- }
-
- /*
- * We take the 'long' return path, and there every subsystem
- * grabs the apropriate locks (kernel lock/ irq lock).
- *
- * we might want to decouple profiling from the 'long path',
- * and do the profiling totally in assembly.
- *
- * Currently this isn't too much of an issue (performance wise),
- * we can take more than 100K local irqs per second on a 100 MHz P5.
- */
-}
-
-/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- * interrupt as well. Thus we cannot inline the local irq ... ]
- */
-
-fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
-{
- int cpu = smp_processor_id();
-
- /*
- * the NMI deadlock-detector uses this.
- */
- per_cpu(irq_stat, cpu).apic_timer_irqs++;
-
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- */
- ack_APIC_irq();
- /*
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- irq_enter();
- smp_local_timer_interrupt(regs);
- irq_exit();
-}
-
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
- */
-fastcall void smp_spurious_interrupt(struct pt_regs *regs)
-{
- unsigned long v;
-
- irq_enter();
- /*
- * Check if this really is a spurious interrupt and ACK it
- * if it is a vectored one. Just in case...
- * Spurious interrupts should not be ACKed.
- */
- v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
- if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
- ack_APIC_irq();
-
- /* see sw-dev-man vol 3, chapter 7.4.13.5 */
- printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never
happen.\n",
- smp_processor_id());
- irq_exit();
-}
-
-/*
- * This interrupt should never happen with our APIC/SMP architecture
- */
-
-fastcall void smp_error_interrupt(struct pt_regs *regs)
-{
- unsigned long v, v1;
-
- irq_enter();
- /* First tickle the hardware, only then report what went on. -- REW */
- v = apic_read(APIC_ESR);
- apic_write(APIC_ESR, 0);
- v1 = apic_read(APIC_ESR);
- ack_APIC_irq();
- atomic_inc(&irq_err_count);
-
- /* Here is what the APIC error bits mean:
- 0: Send CS error
- 1: Receive CS error
- 2: Send accept error
- 3: Receive accept error
- 4: Reserved
- 5: Send illegal vector
- 6: Received illegal vector
- 7: Illegal register address
- */
- printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
- smp_processor_id(), v , v1);
- irq_exit();
-}
-
/*
* This initializes the IO-APIC and APIC hardware if this is
* a UP kernel.
*/
int __init APIC_init_uniprocessor (void)
{
- if (enable_local_apic < 0)
- clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-
- if (!smp_found_config && !cpu_has_apic)
- return -1;
-
- /*
- * Complain if the BIOS pretends there is one.
- */
- if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
- return -1;
- }
-
- verify_local_APIC();
-
- connect_bsp_APIC();
-
- phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
-
- setup_local_APIC();
-
#ifdef CONFIG_X86_IO_APIC
if (smp_found_config)
if (!skip_ioapic_setup && nr_ioapics)
setup_IO_APIC();
#endif
- setup_boot_APIC_clock();
return 0;
}
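
The whole local-APIC path above (detection, fixmap mapping, LVTT setup, 8254-based calibration, and the timer/spurious/error interrupt handlers) is simply deleted: a Xen guest never programs the APIC; the hypervisor owns it and delivers timer events through event channels instead. For reference, the deleted calibrate_APIC_clock() boils down to this arithmetic (a restatement of the removed code, with APIC_DIVISOR = 16 and LOOPS = HZ/10):

    /* APIC bus clocks per timer tick; the counter counts down, so tt1 > tt2 */
    static long apic_bus_clocks_per_tick(long tt1, long tt2, long loops)
    {
            return (tt1 - tt2) * 16 / loops;   /* == 'result' in the old code */
    }
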
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/common.c linux-2.6-xen-sparse/arch/i386/kernel/cpu/common.c
--- pristine-linux-2.6.12/arch/i386/kernel/cpu/common.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/common.c 2005-07-28 13:17:07.000000000 -0700
@@ -15,6 +15,7 @@
#include <asm/apic.h>
#include <mach_apic.h>
#endif
+#include <asm-xen/hypervisor.h>
#include "cpu.h"
@@ -32,6 +33,8 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM
extern void mcheck_init(struct cpuinfo_x86 *c);
+extern void machine_specific_modify_cpu_capabilities(struct cpuinfo_x86 *c);
+
extern int disable_pse;
static void default_init(struct cpuinfo_x86 * c)
@@ -409,6 +412,8 @@ void __init identify_cpu(struct cpuinfo_
c->x86_vendor, c->x86_model);
}
+ machine_specific_modify_cpu_capabilities(c);
+
/* Now the feature flags better reflect actual CPU features! */
printk(KERN_DEBUG "CPU: After all inits, caps:");
@@ -554,6 +559,24 @@ void __init early_cpu_init(void)
disable_pse = 1;
#endif
}
+
+void __init cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
+{
+ unsigned long frames[16];
+ unsigned long va;
+ int f;
+
+ for (va = gdt_descr->address, f = 0;
+ va < gdt_descr->address + gdt_descr->size;
+ va += PAGE_SIZE, f++) {
+ frames[f] = virt_to_machine(va) >> PAGE_SHIFT;
+ make_page_readonly((void *)va);
+ }
+ if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
+ BUG();
+ lgdt_finish();
+}
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -565,7 +588,6 @@ void __init cpu_init (void)
int cpu = smp_processor_id();
struct tss_struct * t = &per_cpu(init_tss, cpu);
struct thread_struct *thread = &current->thread;
- __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
if (cpu_test_and_set(cpu, cpu_initialized)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -573,7 +595,7 @@ void __init cpu_init (void)
}
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
- if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
+ if (cpu_has_vme || cpu_has_de)
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
if (tsc_disable && cpu_has_tsc) {
printk(KERN_NOTICE "Disabling TSC...\n");
@@ -583,30 +605,12 @@ void __init cpu_init (void)
}
/*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
- */
- memcpy(&per_cpu(cpu_gdt_table, cpu), cpu_gdt_table,
- GDT_SIZE);
-
- /* Set up GDT entry for 16bit stack */
- *(__u64 *)&(per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_ESPFIX_SS]) |=
- ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
- ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
- (CPU_16BIT_STACK_SIZE - 1);
-
- cpu_gdt_descr[cpu].size = GDT_SIZE - 1;
- cpu_gdt_descr[cpu].address =
- (unsigned long)&per_cpu(cpu_gdt_table, cpu);
-
- /*
* Set up the per-thread TLS descriptor cache:
*/
- memcpy(thread->tls_array, &per_cpu(cpu_gdt_table, cpu),
- GDT_ENTRY_TLS_ENTRIES * 8);
+ memcpy(thread->tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN],
+ GDT_ENTRY_TLS_ENTRIES * 8);
- __asm__ __volatile__("lgdt %0" : : "m" (cpu_gdt_descr[cpu]));
- __asm__ __volatile__("lidt %0" : : "m" (idt_descr));
+ cpu_gdt_init(&cpu_gdt_descr[cpu]);
/*
* Delete NT
@@ -623,19 +627,15 @@ void __init cpu_init (void)
enter_lazy_tlb(&init_mm, current);
load_esp0(t, thread);
- set_tss_desc(cpu,t);
- load_TR_desc();
- load_LDT(&init_mm.context);
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
+ load_LDT(&init_mm.context);
/* Clear %fs and %gs. */
asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
/* Clear all 6 debug registers: */
-#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
+#define CD(register) HYPERVISOR_set_debugreg(register, 0)
CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
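
cpu_gdt_init() is the paravirtual replacement for the raw lgdt: Xen insists that every page backing a guest GDT be mapped read-only, and the table is handed over as a list of machine frame numbers rather than a virtual address. For the common single-page case the hunk above condenses to the sketch below (the function names are all from the patch itself; xen_load_gdt_page is just an illustrative wrapper name):

    static void xen_load_gdt_page(struct Xgt_desc_struct *gdt_descr)
    {
            unsigned long va    = gdt_descr->address;
            unsigned long frame = virt_to_machine(va) >> PAGE_SHIFT;

            make_page_readonly((void *)va);  /* Xen rejects writable GDT pages */
            if (HYPERVISOR_set_gdt(&frame, gdt_descr->size / 8)) /* entry count */
                    BUG();
            lgdt_finish();                   /* reload the segment registers */
    }

The same theme shows up at the bottom of cpu_init(): mov to %dbN is privileged, so clearing the debug registers becomes HYPERVISOR_set_debugreg() calls.
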
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/Makefile linux-2.6-xen-sparse/arch/i386/kernel/cpu/Makefile
--- pristine-linux-2.6.12/arch/i386/kernel/cpu/Makefile 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/Makefile 2005-07-28 13:17:07.000000000 -0700
@@ -2,18 +2,30 @@
# Makefile for x86-compatible CPU details and quirks
#
-obj-y := common.o proc.o
+CFLAGS += -Iarch/i386/kernel/cpu
-obj-y += amd.o
-obj-y += cyrix.o
-obj-y += centaur.o
-obj-y += transmeta.o
-obj-y += intel.o intel_cacheinfo.o
-obj-y += rise.o
-obj-y += nexgen.o
-obj-y += umc.o
+obj-y := common.o
+c-obj-y += proc.o
-obj-$(CONFIG_X86_MCE) += mcheck/
+c-obj-y += amd.o
+c-obj-y += cyrix.o
+c-obj-y += centaur.o
+c-obj-y += transmeta.o
+c-obj-y += intel.o intel_cacheinfo.o
+c-obj-y += rise.o
+c-obj-y += nexgen.o
+c-obj-y += umc.o
+
+#obj-$(CONFIG_X86_MCE) += ../../../../i386/kernel/cpu/mcheck/
obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+#obj-$(CONFIG_CPU_FREQ) += ../../../../i386/kernel/cpu/cpufreq/
+
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+ @ln -fsn $(srctree)/arch/i386/kernel/cpu/$(notdir $@) $@
+
+obj-y += $(c-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/main.c linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/main.c
--- pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/main.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/main.c 2005-07-28 13:17:07.000000000 -0700
@@ -1,116 +1,46 @@
-/* Generic MTRR (Memory Type Range Register) driver.
-
- Copyright (C) 1997-2000 Richard Gooch
- Copyright (c) 2002 Patrick Mochel
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public
- License along with this library; if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
- Richard Gooch may be reached by email at rgooch@xxxxxxxxxxxxx
- The postal address is:
- Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
-
- Source: "Pentium Pro Family Developer's Manual, Volume 3:
- Operating System Writer's Guide" (Intel document number 242692),
- section 11.11.7
-
- This was cleaned and made readable by Patrick Mochel <mochel@xxxxxxxx>
- on 6-7 March 2002.
- Source: Intel Architecture Software Developers Manual, Volume 3:
- System Programming Guide; Section 9.11. (1997 edition - PPro).
-*/
-
-#include <linux/module.h>
#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
#include <asm/mtrr.h>
-
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
#include "mtrr.h"
-#define MTRR_VERSION "2.0 (20020519)"
-
-u32 num_var_ranges = 0;
-
-unsigned int *usage_table;
-static DECLARE_MUTEX(main_lock);
-
-u32 size_or_mask, size_and_mask;
-
-static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
-
-struct mtrr_ops * mtrr_if = NULL;
-
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type);
+void generic_get_mtrr(unsigned int reg, unsigned long *base,
+ unsigned int *size, mtrr_type * type)
+{
+ dom0_op_t op;
-extern int arr3_protected;
+ op.cmd = DOM0_READ_MEMTYPE;
+ op.u.read_memtype.reg = reg;
+ (void)HYPERVISOR_dom0_op(&op);
-void set_mtrr_ops(struct mtrr_ops * ops)
-{
- if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
- mtrr_ops[ops->vendor] = ops;
+ *size = op.u.read_memtype.nr_pfns;
+ *base = op.u.read_memtype.pfn;
+ *type = op.u.read_memtype.type;
}
-/* Returns non-zero if we have the write-combining memory type */
-static int have_wrcomb(void)
-{
- struct pci_dev *dev;
- u8 rev;
-
- if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) {
- /* ServerWorks LE chipsets < rev 6 have problems with write-combining
- Don't allow it and leave room for other chipsets to be tagged */
- if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
- dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev <= 5) {
- printk(KERN_INFO "mtrr: Serverworks LE rev < 6
detected. Write-combining disabled.\n");
- pci_dev_put(dev);
- return 0;
- }
- }
- /* Intel 450NX errata # 23. Non ascending cacheline evictions to
- write combining memory may resulting in data corruption */
- if (dev->vendor == PCI_VENDOR_ID_INTEL &&
- dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
- printk(KERN_INFO "mtrr: Intel 450NX MMC detected.
Write-combining disabled.\n");
- pci_dev_put(dev);
- return 0;
- }
- pci_dev_put(dev);
- }
- return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0);
-}
+struct mtrr_ops generic_mtrr_ops = {
+ .use_intel_if = 1,
+ .get = generic_get_mtrr,
+};
+
+struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
+unsigned int num_var_ranges;
+unsigned int *usage_table;
-/* This function returns the number of variable MTRRs */
static void __init set_num_var_ranges(void)
{
- unsigned long config = 0, dummy;
+ dom0_op_t op;
- if (use_intel()) {
- rdmsr(MTRRcap_MSR, config, dummy);
- } else if (is_cpu(AMD))
- config = 2;
- else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
- config = 8;
- num_var_ranges = config & 0xff;
+ for (num_var_ranges = 0; ; num_var_ranges++) {
+ op.cmd = DOM0_READ_MEMTYPE;
+ op.u.read_memtype.reg = num_var_ranges;
+ if (HYPERVISOR_dom0_op(&op) != 0)
+ break;
+ }
}
static void __init init_table(void)
@@ -124,293 +54,28 @@ static void __init init_table(void)
return;
}
for (i = 0; i < max; i++)
- usage_table[i] = 1;
-}
-
-struct set_mtrr_data {
- atomic_t count;
- atomic_t gate;
- unsigned long smp_base;
- unsigned long smp_size;
- unsigned int smp_reg;
- mtrr_type smp_type;
-};
-
-#ifdef CONFIG_SMP
-
-static void ipi_handler(void *info)
-/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
- [RETURNS] Nothing.
-*/
-{
- struct set_mtrr_data *data = info;
- unsigned long flags;
-
- local_irq_save(flags);
-
- atomic_dec(&data->count);
- while(!atomic_read(&data->gate))
- cpu_relax();
-
- /* The master has cleared me to execute */
- if (data->smp_reg != ~0U)
- mtrr_if->set(data->smp_reg, data->smp_base,
- data->smp_size, data->smp_type);
- else
- mtrr_if->set_all();
-
- atomic_dec(&data->count);
- while(atomic_read(&data->gate))
- cpu_relax();
-
- atomic_dec(&data->count);
- local_irq_restore(flags);
+ usage_table[i] = 0;
}
-#endif
-
-/**
- * set_mtrr - update mtrrs on all processors
- * @reg: mtrr in question
- * @base: mtrr base
- * @size: mtrr size
- * @type: mtrr type
- *
- * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
- *
- * 1. Send IPI to do the following:
- * 2. Disable Interrupts
- * 3. Wait for all procs to do so
- * 4. Enter no-fill cache mode
- * 5. Flush caches
- * 6. Clear PGE bit
- * 7. Flush all TLBs
- * 8. Disable all range registers
- * 9. Update the MTRRs
- * 10. Enable all range registers
- * 11. Flush all TLBs and caches again
- * 12. Enter normal cache mode and reenable caching
- * 13. Set PGE
- * 14. Wait for buddies to catch up
- * 15. Enable interrupts.
- *
- * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
- * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
- * Meanwhile, they are waiting for that flag to be set. Once it's set, each
- * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it
- * differently, so we call mtrr_if->set() callback and let them take care of it.
- * When they're done, they again decrement data->count and wait for data.gate to
- * be reset.
- * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag.
- * Everyone then enables interrupts and we all continue on.
- *
- * Note that the mechanism is the same for UP systems, too; all the SMP stuff
- * becomes nops.
- */
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
-{
- struct set_mtrr_data data;
- unsigned long flags;
-
- data.smp_reg = reg;
- data.smp_base = base;
- data.smp_size = size;
- data.smp_type = type;
- atomic_set(&data.count, num_booting_cpus() - 1);
- atomic_set(&data.gate,0);
-
- /* Start the ball rolling on other CPUs */
- if (smp_call_function(ipi_handler, &data, 1, 0) != 0)
- panic("mtrr: timed out waiting for other CPUs\n");
-
- local_irq_save(flags);
-
- while(atomic_read(&data.count))
- cpu_relax();
-
- /* ok, reset count and toggle gate */
- atomic_set(&data.count, num_booting_cpus() - 1);
- atomic_set(&data.gate,1);
-
- /* do our MTRR business */
-
- /* HACK!
- * We use this same function to initialize the mtrrs on boot.
- * The state of the boot cpu's mtrrs has been saved, and we want
- * to replicate across all the APs.
- * If we're doing that @reg is set to something special...
- */
- if (reg != ~0U)
- mtrr_if->set(reg,base,size,type);
-
- /* wait for the others */
- while(atomic_read(&data.count))
- cpu_relax();
-
- atomic_set(&data.count, num_booting_cpus() - 1);
- atomic_set(&data.gate,0);
-
- /*
- * Wait here for everyone to have seen the gate change
- * So we're the last ones to touch 'data'
- */
- while(atomic_read(&data.count))
- cpu_relax();
-
- local_irq_restore(flags);
-}
-
-/**
- * mtrr_add_page - Add a memory type region
- * @base: Physical base address of region in pages (4 KB)
- * @size: Physical size of region in pages (4 KB)
- * @type: Type of MTRR desired
- * @increment: If this is true do usage counting on the region
- *
- * Memory type region registers control the caching on newer Intel and
- * non Intel processors. This function allows drivers to request an
- * MTRR is added. The details and hardware specifics of each processor's
- * implementation are hidden from the caller, but nevertheless the
- * caller should expect to need to provide a power of two size on an
- * equivalent power of two boundary.
- *
- * If the region cannot be added either because all regions are in use
- * or the CPU cannot support it a negative value is returned. On success
- * the register number for this entry is returned, but should be treated
- * as a cookie only.
- *
- * On a multiprocessor machine the changes are made to all processors.
- * This is required on x86 by the Intel processors.
- *
- * The available types are
- *
- * %MTRR_TYPE_UNCACHABLE - No caching
- *
- * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
- *
- * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
- *
- * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
- *
- * BUGS: Needs a quiet flag for the cases where drivers do not mind
- * failures and do not wish system log messages to be sent.
- */
-
int mtrr_add_page(unsigned long base, unsigned long size,
unsigned int type, char increment)
{
- int i;
- mtrr_type ltype;
- unsigned long lbase;
- unsigned int lsize;
int error;
+ dom0_op_t op;
- if (!mtrr_if)
- return -ENXIO;
-
- if ((error = mtrr_if->validate_add_page(base,size,type)))
+ op.cmd = DOM0_ADD_MEMTYPE;
+ op.u.add_memtype.pfn = base;
+ op.u.add_memtype.nr_pfns = size;
+ op.u.add_memtype.type = type;
+ if ((error = HYPERVISOR_dom0_op(&op)))
return error;
- if (type >= MTRR_NUM_TYPES) {
- printk(KERN_WARNING "mtrr: type: %u invalid\n", type);
- return -EINVAL;
- }
-
- /* If the type is WC, check that this processor supports it */
- if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
- printk(KERN_WARNING
- "mtrr: your processor doesn't support
write-combining\n");
- return -ENOSYS;
- }
-
- if (base & size_or_mask || size & size_or_mask) {
- printk(KERN_WARNING "mtrr: base or size exceeds the MTRR
width\n");
- return -EINVAL;
- }
+ if (increment)
+ ++usage_table[op.u.add_memtype.reg];
- error = -EINVAL;
-
- /* Search for existing MTRR */
- down(&main_lock);
- for (i = 0; i < num_var_ranges; ++i) {
- mtrr_if->get(i, &lbase, &lsize, &ltype);
- if (base >= lbase + lsize)
- continue;
- if ((base < lbase) && (base + size <= lbase))
- continue;
- /* At this point we know there is some kind of overlap/enclosure */
- if ((base < lbase) || (base + size > lbase + lsize)) {
- printk(KERN_WARNING
- "mtrr: 0x%lx000,0x%lx000 overlaps existing"
- " 0x%lx000,0x%x000\n", base, size, lbase,
- lsize);
- goto out;
- }
- /* New region is enclosed by an existing region */
- if (ltype != type) {
- if (type == MTRR_TYPE_UNCACHABLE)
- continue;
- printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
- base, size, mtrr_attrib_to_str(ltype),
- mtrr_attrib_to_str(type));
- goto out;
- }
- if (increment)
- ++usage_table[i];
- error = i;
- goto out;
- }
- /* Search for an empty MTRR */
- i = mtrr_if->get_free_region(base, size);
- if (i >= 0) {
- set_mtrr(i, base, size, type);
- usage_table[i] = 1;
- } else
- printk(KERN_INFO "mtrr: no more MTRRs available\n");
- error = i;
- out:
- up(&main_lock);
- return error;
+ return op.u.add_memtype.reg;
}
-/**
- * mtrr_add - Add a memory type region
- * @base: Physical base address of region
- * @size: Physical size of region
- * @type: Type of MTRR desired
- * @increment: If this is true do usage counting on the region
- *
- * Memory type region registers control the caching on newer Intel and
- * non Intel processors. This function allows drivers to request an
- * MTRR is added. The details and hardware specifics of each processor's
- * implementation are hidden from the caller, but nevertheless the
- * caller should expect to need to provide a power of two size on an
- * equivalent power of two boundary.
- *
- * If the region cannot be added either because all regions are in use
- * or the CPU cannot support it a negative value is returned. On success
- * the register number for this entry is returned, but should be treated
- * as a cookie only.
- *
- * On a multiprocessor machine the changes are made to all processors.
- * This is required on x86 by the Intel processors.
- *
- * The available types are
- *
- * %MTRR_TYPE_UNCACHABLE - No caching
- *
- * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
- *
- * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
- *
- * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
- *
- * BUGS: Needs a quiet flag for the cases where drivers do not mind
- * failures and do not wish system log messages to be sent.
- */
-
int
mtrr_add(unsigned long base, unsigned long size, unsigned int type,
char increment)
@@ -424,21 +89,6 @@ mtrr_add(unsigned long base, unsigned lo
increment);
}
-/**
- * mtrr_del_page - delete a memory type region
- * @reg: Register returned by mtrr_add
- * @base: Physical base address
- * @size: Size of region
- *
- * If register is supplied then base and size are ignored. This is
- * how drivers should call it.
- *
- * Releases an MTRR region. If the usage count drops to zero the
- * register is freed and the region returns to default state.
- * On success the register is returned, on failure a negative error
- * code.
- */
-
int mtrr_del_page(int reg, unsigned long base, unsigned long size)
{
int i, max;
@@ -446,12 +96,9 @@ int mtrr_del_page(int reg, unsigned long
unsigned long lbase;
unsigned int lsize;
int error = -EINVAL;
-
- if (!mtrr_if)
- return -ENXIO;
+ dom0_op_t op;
max = num_var_ranges;
- down(&main_lock);
if (reg < 0) {
/* Search for existing MTRR */
for (i = 0; i < max; ++i) {
@@ -467,46 +114,20 @@ int mtrr_del_page(int reg, unsigned long
goto out;
}
}
- if (reg >= max) {
- printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
- goto out;
- }
- if (is_cpu(CYRIX) && !use_intel()) {
- if ((reg == 3) && arr3_protected) {
- printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n");
- goto out;
- }
- }
- mtrr_if->get(reg, &lbase, &lsize, &ltype);
- if (lsize < 1) {
- printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
- goto out;
- }
if (usage_table[reg] < 1) {
printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
goto out;
}
- if (--usage_table[reg] < 1)
- set_mtrr(reg, 0, 0, 0);
+ if (--usage_table[reg] < 1) {
+ op.cmd = DOM0_DEL_MEMTYPE;
+ op.u.del_memtype.handle = 0;
+ op.u.add_memtype.reg = reg;
+ (void)HYPERVISOR_dom0_op(&op);
+ }
error = reg;
out:
- up(&main_lock);
return error;
}
-/**
- * mtrr_del - delete a memory type region
- * @reg: Register returned by mtrr_add
- * @base: Physical base address
- * @size: Size of region
- *
- * If register is supplied then base and size are ignored. This is
- * how drivers should call it.
- *
- * Releases an MTRR region. If the usage count drops to zero the
- * register is freed and the region returns to default state.
- * On success the register is returned, on failure a negative error
- * code.
- */
int
mtrr_del(int reg, unsigned long base, unsigned long size)
@@ -522,157 +143,23 @@ mtrr_del(int reg, unsigned long base, un
EXPORT_SYMBOL(mtrr_add);
EXPORT_SYMBOL(mtrr_del);
-/* HACK ALERT!
- * These should be called implicitly, but we can't yet until all the initcall
- * stuff is done...
- */
-extern void amd_init_mtrr(void);
-extern void cyrix_init_mtrr(void);
-extern void centaur_init_mtrr(void);
-
-static void __init init_ifs(void)
-{
- amd_init_mtrr();
- cyrix_init_mtrr();
- centaur_init_mtrr();
-}
-
-static void __init init_other_cpus(void)
+static int __init mtrr_init(void)
{
- if (use_intel())
- get_mtrr_state();
-
- /* bring up the other processors */
- set_mtrr(~0U,0,0,0);
-
- if (use_intel()) {
- finalize_mtrr_state();
- mtrr_state_warn();
- }
-}
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ if (!(xen_start_info.flags & SIF_PRIVILEGED))
+ return -ENODEV;
-struct mtrr_value {
- mtrr_type ltype;
- unsigned long lbase;
- unsigned int lsize;
-};
+ if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
+ (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
+ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
+ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
+ return -ENODEV;
-static struct mtrr_value * mtrr_state;
+ set_num_var_ranges();
+ init_table();
-static int mtrr_save(struct sys_device * sysdev, u32 state)
-{
- int i;
- int size = num_var_ranges * sizeof(struct mtrr_value);
-
- mtrr_state = kmalloc(size,GFP_ATOMIC);
- if (mtrr_state)
- memset(mtrr_state,0,size);
- else
- return -ENOMEM;
-
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i,
- &mtrr_state[i].lbase,
- &mtrr_state[i].lsize,
- &mtrr_state[i].ltype);
- }
return 0;
}
-static int mtrr_restore(struct sys_device * sysdev)
-{
- int i;
-
- for (i = 0; i < num_var_ranges; i++) {
- if (mtrr_state[i].lsize)
- set_mtrr(i,
- mtrr_state[i].lbase,
- mtrr_state[i].lsize,
- mtrr_state[i].ltype);
- }
- kfree(mtrr_state);
- return 0;
-}
-
-
-
-static struct sysdev_driver mtrr_sysdev_driver = {
- .suspend = mtrr_save,
- .resume = mtrr_restore,
-};
-
-
-/**
- * mtrr_init - initialize mtrrs on the boot CPU
- *
- * This needs to be called early; before any of the other CPUs are
- * initialized (i.e. before smp_init()).
- *
- */
-static int __init mtrr_init(void)
-{
- init_ifs();
-
- if (cpu_has_mtrr) {
- mtrr_if = &generic_mtrr_ops;
- size_or_mask = 0xff000000; /* 36 bits */
- size_and_mask = 0x00f00000;
-
- /* This is an AMD specific MSR, but we assume(hope?) that
- Intel will implement it to when they extend the address
- bus of the Xeon. */
- if (cpuid_eax(0x80000000) >= 0x80000008) {
- u32 phys_addr;
- phys_addr = cpuid_eax(0x80000008) & 0xff;
- size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1);
- size_and_mask = ~size_or_mask & 0xfff00000;
- } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
- boot_cpu_data.x86 == 6) {
- /* VIA C* family have Intel style MTRRs, but
- don't support PAE */
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- } else {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (cpu_has_k6_mtrr) {
- /* Pre-Athlon (K6) AMD CPU MTRRs */
- mtrr_if = mtrr_ops[X86_VENDOR_AMD];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CENTAUR:
- if (cpu_has_centaur_mcr) {
- mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CYRIX:
- if (cpu_has_cyrix_arr) {
- mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- default:
- break;
- }
- }
- printk(KERN_INFO "mtrr: v%s\n",MTRR_VERSION);
-
- if (mtrr_if) {
- set_num_var_ranges();
- init_table();
- init_other_cpus();
-
- return sysdev_driver_register(&cpu_sysdev_class,
- &mtrr_sysdev_driver);
- }
- return -ENXIO;
-}
-
subsys_initcall(mtrr_init);
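
The whole MTRR driver collapses to a thin shim: reads, adds and deletes are forwarded to the hypervisor as dom0 operations, which is also why mtrr_init() now bails out unless the domain is privileged (SIF_PRIVILEGED), and why set_num_var_ranges() can simply probe DOM0_READ_MEMTYPE until it fails. The add path, condensed from the hunk above into a self-contained form (all field and call names come from the patch; xen_mtrr_add_page is an illustrative wrapper name):

    static int xen_mtrr_add_page(unsigned long base, unsigned long size,
                                 unsigned int type)
    {
            dom0_op_t op;
            int error;

            op.cmd                   = DOM0_ADD_MEMTYPE;
            op.u.add_memtype.pfn     = base;     /* base and size in 4 KB pages */
            op.u.add_memtype.nr_pfns = size;
            op.u.add_memtype.type    = type;
            if ((error = HYPERVISOR_dom0_op(&op)))
                    return error;
            return op.u.add_memtype.reg;         /* register index is the cookie */
    }

One oddity worth noting: the delete path fills in op.u.add_memtype.reg while issuing DOM0_DEL_MEMTYPE, presumably relying on the union layout lining up with del_memtype.
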
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/Makefile linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/Makefile
--- pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/Makefile 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/Makefile 2005-07-28 13:17:07.000000000 -0700
@@ -1,5 +1,16 @@
-obj-y := main.o if.o generic.o state.o
-obj-y += amd.o
-obj-y += cyrix.o
-obj-y += centaur.o
+obj-y := main.o
+c-obj-y := if.o
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): $(obj)/mtrr.h
+ @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/$(notdir $@) $@
+
+$(patsubst %.o,$(obj)/%.c,$(obj-y)): $(obj)/mtrr.h
+
+$(obj)/mtrr.h:
+ @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/mtrr.h $@
+
+obj-y += $(c-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/entry.S linux-2.6-xen-sparse/arch/i386/kernel/entry.S
--- pristine-linux-2.6.12/arch/i386/kernel/entry.S 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/entry.S 2005-07-28 13:17:07.000000000 -0700
@@ -47,8 +47,8 @@
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/page.h>
-#include <asm/desc.h>
#include "irq_vectors.h"
+#include <asm-xen/xen-public/xen.h>
#define nr_syscalls ((syscall_table_size)/4)
@@ -64,6 +64,7 @@ ES = 0x20
ORIG_EAX = 0x24
EIP = 0x28
CS = 0x2C
+EVENT_MASK = 0x2E
EFLAGS = 0x30
OLDESP = 0x34
OLDSS = 0x38
@@ -75,11 +76,43 @@ DF_MASK = 0x00000400
NT_MASK = 0x00004000
VM_MASK = 0x00020000
+/* Offsets into shared_info_t. */
+#define evtchn_upcall_pending /* 0 */
+#define evtchn_upcall_mask 1
+
+#define sizeof_vcpu_shift 3
+
+#ifdef CONFIG_SMP
+#define preempt_disable(reg) incl TI_preempt_count(reg)
+#define preempt_enable(reg) decl TI_preempt_count(reg)
+#define XEN_GET_VCPU_INFO(reg) preempt_disable(%ebp) ; \
+ movl TI_cpu(%ebp),reg ; \
+ shl $sizeof_vcpu_shift,reg ; \
+ addl HYPERVISOR_shared_info,reg
+#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%ebp)
+#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
+#else
+#define XEN_GET_VCPU_INFO(reg) movl HYPERVISOR_shared_info,reg
+#define XEN_PUT_VCPU_INFO(reg)
+#define XEN_PUT_VCPU_INFO_fixup
+#endif
+
+#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
+#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
+#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
+ XEN_LOCKED_BLOCK_EVENTS(reg) ; \
+ XEN_PUT_VCPU_INFO(reg)
+#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
+ XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
+ XEN_PUT_VCPU_INFO(reg)
+#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
+
#ifdef CONFIG_PREEMPT
-#define preempt_stop cli
+#define preempt_stop GET_THREAD_INFO(%ebp) ; \
+ XEN_BLOCK_EVENTS(%esi)
#else
#define preempt_stop
-#define resume_kernel restore_nocheck
+#define resume_kernel restore_all
#endif
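
These macros are the heart of the entry.S changes: every cli/sti pair becomes a write to the per-VCPU event mask byte in the shared-info page, and on SMP the XEN_GET_VCPU_INFO/XEN_PUT_VCPU_INFO bracket must disable preemption because the vcpu_info address it computes is per-CPU. In C the two core operations amount to nothing more than this (vcpu_info_t is the structure name assumed from the Xen public headers this patch includes):

    /* C equivalents of XEN_BLOCK_EVENTS / XEN_UNBLOCK_EVENTS */
    static inline void block_events(volatile vcpu_info_t *v)
    {
            v->evtchn_upcall_mask = 1;      /* like cli: hold event upcalls */
    }

    static inline void unblock_events(volatile vcpu_info_t *v)
    {
            v->evtchn_upcall_mask = 0;      /* like sti */
    }
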
#define SAVE_ALL \
@@ -123,6 +156,23 @@ VM_MASK = 0x00020000
.previous
+#define RESTORE_ALL \
+ RESTORE_REGS \
+ addl $4, %esp; \
+1: iret; \
+.section .fixup,"ax"; \
+2: movl $(__USER_DS), %edx; \
+ movl %edx, %ds; \
+ movl %edx, %es; \
+ movl $11,%eax; \
+ call do_exit; \
+.previous; \
+.section __ex_table,"a";\
+ .align 4; \
+ .long 1b,2b; \
+.previous
+
+
ENTRY(ret_from_fork)
pushl %eax
call schedule_tail
@@ -145,10 +195,10 @@ ret_from_intr:
GET_THREAD_INFO(%ebp)
movl EFLAGS(%esp), %eax # mix EFLAGS and CS
movb CS(%esp), %al
- testl $(VM_MASK | 3), %eax
- jz resume_kernel
+ testl $(VM_MASK | 2), %eax
+ jz resume_kernel # returning to kernel or vm86-space
ENTRY(resume_userspace)
- cli # make sure we don't miss an interrupt
+ XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -159,15 +209,15 @@ ENTRY(resume_userspace)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
- cli
+ XEN_BLOCK_EVENTS(%esi)
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
- jnz restore_nocheck
+ jnz restore_all
need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
jz restore_all
- testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all
+ testb $0xFF,EVENT_MASK(%esp) # interrupts off (exception path) ?
+ jnz restore_all
call preempt_schedule_irq
jmp need_resched
#endif
@@ -202,8 +252,7 @@ sysenter_past_esp:
SAVE_ALL
GET_THREAD_INFO(%ebp)
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
+ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -227,8 +276,7 @@ ENTRY(system_call)
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
+ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -236,63 +284,31 @@ syscall_call:
call *sys_call_table(,%eax,4)
movl %eax,EAX(%esp) # store the return value
syscall_exit:
- cli # make sure we don't miss an interrupt
+ XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx # current->work
jne syscall_exit_work
-
restore_all:
- movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- # Warning: OLDSS(%esp) contains the wrong/random values if we
- # are returning to the kernel.
- # See comments in process.c:copy_thread() for details.
- movb OLDSS(%esp), %ah
- movb CS(%esp), %al
- andl $(VM_MASK | (4 << 8) | 3), %eax
- cmpl $((4 << 8) | 3), %eax
- je ldt_ss # returning to user-space with LDT SS
-restore_nocheck:
- RESTORE_REGS
- addl $4, %esp
-1: iret
-.section .fixup,"ax"
-iret_exc:
- sti
- pushl $0 # no error code
- pushl $do_iret_error
- jmp error_code
-.previous
-.section __ex_table,"a"
- .align 4
- .long 1b,iret_exc
-.previous
+ testl $VM_MASK, EFLAGS(%esp)
+ jnz resume_vm86
+ movb EVENT_MASK(%esp), %al
+ notb %al # %al == ~saved_mask
+ XEN_GET_VCPU_INFO(%esi)
+ andb evtchn_upcall_mask(%esi),%al
+ andb $1,%al # %al == mask & ~saved_mask
+ jnz restore_all_enable_events # != 0 => reenable event delivery
+ XEN_PUT_VCPU_INFO(%esi)
+ RESTORE_ALL
-ldt_ss:
- larl OLDSS(%esp), %eax
- jnz restore_nocheck
- testl $0x00400000, %eax # returning to 32bit stack?
- jnz restore_nocheck # allright, normal return
- /* If returning to userspace with 16bit stack,
- * try to fix the higher word of ESP, as the CPU
- * won't restore it.
- * This is an "official" bug of all the x86-compatible
- * CPUs, which we can try to work around to make
- * dosemu and wine happy. */
- subl $8, %esp # reserve space for switch16 pointer
- cli
- movl %esp, %eax
- /* Set up the 16bit stack frame with switch32 pointer on top,
- * and a switch16 pointer on top of the current frame. */
- call setup_x86_bogus_stack
+resume_vm86:
+ XEN_UNBLOCK_EVENTS(%esi)
RESTORE_REGS
- lss 20+4(%esp), %esp # switch to 16bit stack
-1: iret
-.section __ex_table,"a"
- .align 4
- .long 1b,iret_exc
-.previous
+ movl %eax,(%esp)
+ movl $__HYPERVISOR_switch_vm86,%eax
+ int $0x82
+ ud2
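
restore_all can no longer be a bare iret. The saved event mask is stashed at EVENT_MASK = 0x2E, i.e. in the otherwise-unused upper half of the saved CS slot, and the test above computes ~saved_mask & current_mask & 1: the slow path is taken only when the interrupted frame had events unmasked but they are masked now. Rendered in C (a sketch; the two callees are just labels for the two asm exit paths, and vcpu_info_t is the type assumed from the Xen headers):

    static void restore_all_in_c(unsigned char saved_event_mask,
                                 volatile vcpu_info_t *vcpu)
    {
            if (!saved_event_mask && vcpu->evtchn_upcall_mask)
                    restore_all_enable_events(); /* deliver anything pending */
            else
                    plain_restore_and_iret();    /* ordinary RESTORE_ALL */
    }

vm86 returns are different again: resume_vm86 bounces through the __HYPERVISOR_switch_vm86 hypercall, since the guest kernel cannot reload vm86 state itself.
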
# perform work that needs to be done immediately before resumption
ALIGN
@@ -301,7 +317,7 @@ work_pending:
jz work_notifysig
work_resched:
call schedule
- cli # make sure we don't miss an interrupt
+ XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -348,7 +364,7 @@ syscall_trace_entry:
syscall_exit_work:
testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
jz work_pending
- sti # could let do_syscall_trace() call
+ XEN_UNBLOCK_EVENTS(%esi) # could let do_syscall_trace() call
# schedule() instead
movl %esp, %eax
movl $1, %edx
@@ -368,27 +384,7 @@ syscall_badsys:
movl $-ENOSYS,EAX(%esp)
jmp resume_userspace
-#define FIXUP_ESPFIX_STACK \
- movl %esp, %eax; \
- /* switch to 32bit stack using the pointer on top of 16bit stack */ \
- lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
- /* copy data from 16bit stack to 32bit stack */ \
- call fixup_x86_bogus_stack; \
- /* put ESP to the proper location */ \
- movl %eax, %esp;
-#define UNWIND_ESPFIX_STACK \
- pushl %eax; \
- movl %ss, %eax; \
- /* see if on 16bit stack */ \
- cmpw $__ESPFIX_SS, %ax; \
- jne 28f; \
- movl $__KERNEL_DS, %edx; \
- movl %edx, %ds; \
- movl %edx, %es; \
- /* switch to 32bit stack */ \
- FIXUP_ESPFIX_STACK \
-28: popl %eax;
-
+#if 0 /* XEN */
/*
* Build the entry stubs and pointer table with
* some assembler magic.
@@ -426,6 +422,7 @@ ENTRY(name) \
/* The include is where all of the SMP etc. interrupts come from */
#include "entry_arch.h"
+#endif /* XEN */
ENTRY(divide_error)
pushl $0 # no error code
@@ -443,9 +440,7 @@ error_code:
pushl %ecx
pushl %ebx
cld
- pushl %es
- UNWIND_ESPFIX_STACK
- popl %ecx
+ movl %es, %ecx
movl ES(%esp), %edi # get the function address
movl ORIG_EAX(%esp), %edx # get the error code
movl %eax, ORIG_EAX(%esp)
@@ -457,6 +452,118 @@ error_code:
call *%edi
jmp ret_from_exception
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+ENTRY(hypervisor_callback)
+ pushl %eax
+ SAVE_ALL
+ movl EIP(%esp),%eax
+ cmpl $scrit,%eax
+ jb 11f
+ cmpl $ecrit,%eax
+ jb critical_region_fixup
+11: push %esp
+ call evtchn_do_upcall
+ add $4,%esp
+ jmp ret_from_intr
+
+ ALIGN
+restore_all_enable_events:
+ XEN_LOCKED_UNBLOCK_EVENTS(%esi)
+scrit: /**** START OF CRITICAL REGION ****/
+ XEN_TEST_PENDING(%esi)
+ jnz 14f # process more events if necessary...
+ XEN_PUT_VCPU_INFO(%esi)
+ RESTORE_ALL
+14: XEN_LOCKED_BLOCK_EVENTS(%esi)
+ XEN_PUT_VCPU_INFO(%esi)
+ jmp 11b
+ecrit: /**** END OF CRITICAL REGION ****/
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. We do this quickly using the lookup table
+# 'critical_fixup_table'. For each byte offset in the critical region, it
+# provides the number of bytes which have already been popped from the
+# interrupted stack frame.
+critical_region_fixup:
+ addl $critical_fixup_table-scrit,%eax
+ movzbl (%eax),%eax # %eax contains num bytes popped
+ cmpb $0xff,%al # 0xff => vcpu_info critical region
+ jne 15f
+ GET_THREAD_INFO(%ebp)
+ XEN_PUT_VCPU_INFO(%esi) # abort vcpu_info critical region
+ xorl %eax,%eax
+15: mov %esp,%esi
+ add %eax,%esi # %esi points at end of src region
+ mov %esp,%edi
+ add $0x34,%edi # %edi points at end of dst region
+ mov %eax,%ecx
+ shr $2,%ecx # convert words to bytes
+ je 17f # skip loop if nothing to copy
+16: subl $4,%esi # pre-decrementing copy loop
+ subl $4,%edi
+ movl (%esi),%eax
+ movl %eax,(%edi)
+ loop 16b
+17: movl %edi,%esp # final %edi is top of merged stack
+ jmp 11b
+
+critical_fixup_table:
+ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = XEN_TEST_PENDING
+ .byte 0xff,0xff # jnz 14f
+ XEN_PUT_VCPU_INFO_fixup
+ .byte 0x00 # pop %ebx
+ .byte 0x04 # pop %ecx
+ .byte 0x08 # pop %edx
+ .byte 0x0c # pop %esi
+ .byte 0x10 # pop %edi
+ .byte 0x14 # pop %ebp
+ .byte 0x18 # pop %eax
+ .byte 0x1c # pop %ds
+ .byte 0x20 # pop %es
+ .byte 0x24,0x24,0x24 # add $4,%esp
+ .byte 0x28 # iret
+ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
+ XEN_PUT_VCPU_INFO_fixup
+ .byte 0x00,0x00 # jmp 11b
+
+# Hypervisor uses this for application faults while it executes.
+ENTRY(failsafe_callback)
+1: popl %ds
+2: popl %es
+3: popl %fs
+4: popl %gs
+ subl $4,%esp
+ SAVE_ALL
+ jmp ret_from_exception
+.section .fixup,"ax"; \
+6: movl $0,(%esp); \
+ jmp 1b; \
+7: movl $0,(%esp); \
+ jmp 2b; \
+8: movl $0,(%esp); \
+ jmp 3b; \
+9: movl $0,(%esp); \
+ jmp 4b; \
+.previous; \
+.section __ex_table,"a";\
+ .align 4; \
+ .long 1b,6b; \
+ .long 2b,7b; \
+ .long 3b,8b; \
+ .long 4b,9b; \
+.previous
+
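The fixup machinery deserves a gloss: for every instruction byte between scrit and ecrit, critical_fixup_table records how many bytes the interrupted activation had already popped off its frame (0xff flags the vcpu_info bracket, which is simply aborted). critical_region_fixup then slides the current partial frame down over the interrupted one and restarts at 11:. The copy at labels 15:-17: is, in C (note, incidentally, that the "convert words to bytes" comment above has it backwards: the shr $2 converts bytes to words):

    /* merge the current frame into the interrupted one; 'popped' comes
     * from critical_fixup_table, esp is the handler's stack pointer */
    static void merge_frames(char *esp, unsigned int popped)
    {
            unsigned long *src = (unsigned long *)(esp + popped); /* src end */
            unsigned long *dst = (unsigned long *)(esp + 0x34);   /* dst end */
            unsigned int words = popped / 4;

            while (words--)
                    *--dst = *--src;   /* pre-decrementing copy, as at 16: */
            /* final dst is the top of the merged stack (label 17:) */
    }
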
ENTRY(coprocessor_error)
pushl $0
pushl $do_coprocessor_error
@@ -470,17 +577,9 @@ ENTRY(simd_coprocessor_error)
ENTRY(device_not_available)
pushl $-1 # mark this as an int
SAVE_ALL
- movl %cr0, %eax
- testl $0x4, %eax # EM (math emulation bit)
- jne device_not_available_emulate
preempt_stop
call math_state_restore
jmp ret_from_exception
-device_not_available_emulate:
- pushl $0 # temporary storage for ORIG_EIP
- call math_emulate
- addl $4, %esp
- jmp ret_from_exception
/*
* Debug traps and NMI can happen at the one SYSENTER instruction
@@ -516,6 +615,7 @@ debug_stack_correct:
call do_debug
jmp ret_from_exception
+#if 0 /* XEN */
/*
* NMI is doubly nasty. It can happen _while_ we're handling
* a debug fault, and the debug fault hasn't yet been able to
@@ -525,11 +625,6 @@ debug_stack_correct:
* fault happened on the sysenter path.
*/
ENTRY(nmi)
- pushl %eax
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl %eax
- je nmi_16bit_stack
cmpl $sysenter_entry,(%esp)
je nmi_stack_fixup
pushl %eax
@@ -549,7 +644,7 @@ nmi_stack_correct:
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_nmi
- jmp restore_all
+ RESTORE_ALL
nmi_stack_fixup:
FIX_STACK(12,nmi_stack_correct, 1)
@@ -564,29 +659,7 @@ nmi_debug_stack_check:
nmi_debug_stack_fixup:
FIX_STACK(24,nmi_stack_correct, 1)
jmp nmi_stack_correct
-
-nmi_16bit_stack:
- /* create the pointer to lss back */
- pushl %ss
- pushl %esp
- movzwl %sp, %esp
- addw $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl 16(%esp)
- .endr
- pushl %eax
- SAVE_ALL
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to 16bit stack
-1: iret
-.section __ex_table,"a"
- .align 4
- .long 1b,iret_exc
-.previous
+#endif /* XEN */
ENTRY(int3)
pushl $-1 # mark this as an int
@@ -636,9 +709,33 @@ ENTRY(alignment_check)
pushl $do_alignment_check
jmp error_code
+# This handler is special, because it gets an extra value on its stack,
+# which is the linear faulting address.
+# fastcall register usage: %eax = pt_regs, %edx = error code,
+# %ecx = fault address
ENTRY(page_fault)
- pushl $do_page_fault
- jmp error_code
+ pushl %ds
+ pushl %eax
+ xorl %eax, %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ decl %eax /* eax = -1 */
+ pushl %ecx
+ pushl %ebx
+ cld
+ movl %es,%edi
+ movl ES(%esp), %ecx /* get the faulting address */
+ movl ORIG_EAX(%esp), %edx /* get the error code */
+ movl %eax, ORIG_EAX(%esp)
+ movl %edi, ES(%esp)
+ movl $(__KERNEL_DS),%eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %esp,%eax /* pt_regs pointer */
+ call do_page_fault
+ jmp ret_from_exception
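
Xen reports the faulting address on the exception frame rather than in %cr2 (which a non-privileged guest cannot read), so page_fault grows its own expansion of error_code that pulls the address out of the frame into %ecx. Per the register comment above, the matching C side would be declared fastcall (an assumed prototype, following the calling convention the patch states):

    /* fastcall: %eax = regs, %edx = error_code, %ecx = address */
    fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code,
                                unsigned long address);
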
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
@@ -647,9 +744,8 @@ ENTRY(machine_check)
jmp error_code
#endif
-ENTRY(spurious_interrupt_bug)
- pushl $0
- pushl $do_spurious_interrupt_bug
+ENTRY(fixup_4gb_segment)
+ pushl $do_fixup_4gb_segment
jmp error_code
#include "syscall_table.S"
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/head.S linux-2.6-xen-sparse/arch/i386/kernel/head.S
--- pristine-linux-2.6.12/arch/i386/kernel/head.S 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/head.S 2005-07-28 13:17:07.000000000 -0700
@@ -1,24 +1,25 @@
-/*
- * linux/arch/i386/kernel/head.S -- the 32-bit startup code.
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * Enhanced CPU detection and feature setting code by Mike Jagdis
- * and Martin Mares, November 1997.
- */
-.text
#include <linux/config.h>
+
+.section __xen_guest
+ .ascii "GUEST_OS=linux,GUEST_VER=2.6"
+ .ascii ",XEN_VER=3.0"
+ .ascii ",VIRT_BASE=0xC0000000"
+#ifdef CONFIG_X86_PAE
+ .ascii ",PAE=yes"
+#else
+ .ascii ",PAE=no"
+#endif
+ .ascii ",LOADER=generic"
+ .byte 0
+
+.text
#include <linux/threads.h>
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm_offsets.h>
-#include <asm/setup.h>
+#include <asm-xen/xen-public/arch-x86_32.h>
/*
* References to members of the new_cpu_data structure.
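
The new __xen_guest section replaces the real-mode boot protocol: a NUL-terminated, comma-separated key=value string that the domain builder reads from the ELF image to learn the guest ABI (guest OS and version, required Xen version, VIRT_BASE, PAE, loader type); objdump -s -j __xen_guest vmlinux will dump it. A hypothetical sketch of how a builder could pull one key out of the blob (guest_param is an invented name, purely illustrative):

    #include <string.h>

    /* e.g. guest_param(blob, "VIRT_BASE=") returns "0xC0000000,..." */
    static const char *guest_param(const char *blob, const char *key)
    {
            const char *p = strstr(blob, key);
            return p ? p + strlen(key) : NULL;
    }
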
@@ -33,239 +34,24 @@
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
-/*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially. We need one bit for
- * each possible page, but only in low memory, which means
- * 2^32/4096/8 = 128K worst case (4G/4G split.)
- *
- * Modulo rounding, each megabyte assigned here requires a kilobyte of
- * memory, which is currently unreclaimed.
- *
- * This should be a multiple of a page.
- */
-#define INIT_MAP_BEYOND_END (128*1024)
-
-
-/*
- * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
- * %esi points to the real-mode code as a 32-bit pointer.
- * CS and DS must be 4 GB flat segments, but we don't depend on
- * any particular GDT layout, because we load our own as soon as we
- * can.
- */
ENTRY(startup_32)
-
-/*
- * Set segments to known values.
- */
cld
- lgdt boot_gdt_descr - __PAGE_OFFSET
- movl $(__BOOT_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- movl %eax,%fs
- movl %eax,%gs
-/*
- * Clear BSS first so that there are no surprises...
- * No need to cld as DF is already clear from cld above...
- */
- xorl %eax,%eax
- movl $__bss_start - __PAGE_OFFSET,%edi
- movl $__bss_stop - __PAGE_OFFSET,%ecx
- subl %edi,%ecx
- shrl $2,%ecx
- rep ; stosl
-
-/*
- * Initialize page tables. This creates a PDE and a set of page
- * tables, which are located immediately beyond _end. The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
- * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
- *
- * Warning: don't use %esi or the stack in this code. However, %esp
- * can be used as a GPR if you really need it...
- */
-page_pde_offset = (__PAGE_OFFSET >> 20);
-
- movl $(pg0 - __PAGE_OFFSET), %edi
- movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
- movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
-10:
- leal 0x007(%edi),%ecx /* Create PDE entry */
- movl %ecx,(%edx) /* Store identity PDE entry */
- movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
- addl $4,%edx
- movl $1024, %ecx
-11:
- stosl
- addl $0x1000,%eax
- loop 11b
- /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
- /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
- leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
- cmpl %ebp,%eax
- jb 10b
- movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
+ /* Copy the necessary stuff from xen_start_info structure. */
+ mov $xen_start_info_union,%edi
+ mov $512,%ecx
+ rep movsl
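
All the deleted identity-mapping and boot-parameter shuffling is unnecessary because Xen enters the kernel with paging already enabled and %esi pointing at the start_info page; the four instructions above just copy 512 longs (2 KB) of it into xen_start_info_union. In C this is a single line (memcpy standing in for rep movsl):

    /* start_info = pointer handed over in %esi by the domain builder */
    memcpy(&xen_start_info_union, start_info, 512 * sizeof(long));  /* 2 KB */
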
#ifdef CONFIG_SMP
- xorl %ebx,%ebx /* This is the boot CPU (BSP) */
- jmp 3f
-
-/*
- * Non-boot CPU entry point; entered from trampoline.S
- * We can't lgdt here, because lgdt itself uses a data segment, but
- * we know the trampoline has already loaded the boot_gdt_table GDT
- * for us.
- */
ENTRY(startup_32_smp)
cld
- movl $(__BOOT_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- movl %eax,%fs
- movl %eax,%gs
-
-/*
- * New page tables may be in 4Mbyte page mode and may
- * be using the global pages.
- *
- * NOTE! If we are on a 486 we may have no cr4 at all!
- * So we do not try to touch it unless we really have
- * some bits in it to set. This won't work if the BSP
- * implements cr4 but this AP does not -- very unlikely
- * but be warned! The same applies to the pse feature
- * if not equally supported. --macro
- *
- * NOTE! We have to correct for the fact that we're
- * not yet offset PAGE_OFFSET..
- */
-#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
- movl cr4_bits,%edx
- andl %edx,%edx
- jz 6f
- movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
- orl %edx,%eax
- movl %eax,%cr4
-
- btl $5, %eax # check if PAE is enabled
- jnc 6f
-
- /* Check if extended functions are implemented */
- movl $0x80000000, %eax
- cpuid
- cmpl $0x80000000, %eax
- jbe 6f
- mov $0x80000001, %eax
- cpuid
- /* Execute Disable bit supported? */
- btl $20, %edx
- jnc 6f
-
- /* Setup EFER (Extended Feature Enable Register) */
- movl $0xc0000080, %ecx
- rdmsr
-
- btsl $11, %eax
- /* Make changes effective */
- wrmsr
-
-6:
- /* This is a secondary processor (AP) */
- xorl %ebx,%ebx
- incl %ebx
-
-3:
#endif /* CONFIG_SMP */
-/*
- * Enable paging
- */
- movl $swapper_pg_dir-__PAGE_OFFSET,%eax
- movl %eax,%cr3 /* set the page table pointer.. */
- movl %cr0,%eax
- orl $0x80000000,%eax
- movl %eax,%cr0 /* ..and set paging (PG) bit */
- ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
-1:
/* Set up the stack pointer */
lss stack_start,%esp
-/*
- * Initialize eflags. Some BIOS's leave bits like NT set. This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
- */
- pushl $0
- popfl
-
-#ifdef CONFIG_SMP
- andl %ebx,%ebx
- jz 1f /* Initial CPU cleans BSS */
- jmp checkCPUtype
-1:
-#endif /* CONFIG_SMP */
-
-/*
- * start system 32-bit setup. We need to re-do some of the things done
- * in 16-bit mode for the "real" operations.
- */
- call setup_idt
-
-/*
- * Copy bootup parameters out of the way.
- * Note: %esi still has the pointer to the real-mode data.
- */
- movl $boot_params,%edi
- movl $(PARAM_SIZE/4),%ecx
- cld
- rep
- movsl
- movl boot_params+NEW_CL_POINTER,%esi
- andl %esi,%esi
- jnz 2f # New command line protocol
- cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
- jne 1f
- movzwl OLD_CL_OFFSET,%esi
- addl $(OLD_CL_BASE_ADDR),%esi
-2:
- movl $saved_command_line,%edi
- movl $(COMMAND_LINE_SIZE/4),%ecx
- rep
- movsl
-1:
checkCPUtype:
- movl $-1,X86_CPUID # -1 for no CPUID initially
-
-/* check if it is 486 or 386. */
-/*
- * XXX - this does a lot of unnecessary setup. Alignment checks don't
- * apply at our cpl of 0 and the stack ought to be aligned already, and
- * we don't need to preserve eflags.
- */
-
- movb $3,X86 # at least 386
- pushfl # push EFLAGS
- popl %eax # get EFLAGS
- movl %eax,%ecx # save original EFLAGS
- xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
- pushl %eax # copy to EFLAGS
- popfl # set EFLAGS
- pushfl # get new EFLAGS
- popl %eax # put it in eax
- xorl %ecx,%eax # change in flags
- pushl %ecx # restore original EFLAGS
- popfl
- testl $0x40000,%eax # check if AC bit changed
- je is386
-
- movb $4,X86 # at least 486
- testl $0x200000,%eax # check if ID bit changed
- je is486
-
/* get vendor info */
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
cpuid
@@ -274,9 +60,6 @@ checkCPUtype:
movl %edx,X86_VENDOR_ID+4 # next 4 chars
movl %ecx,X86_VENDOR_ID+8 # last 4 chars
- orl %eax,%eax # do we have processor info as well?
- je is486
-
movl $1,%eax # Use the CPUID instruction to get CPU type
cpuid
movb %al,%cl # save reg for future use
@@ -289,32 +72,13 @@ checkCPUtype:
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
-is486: movl $0x50022,%ecx # set AM, WP, NE and MP
- jmp 2f
-
-is386: movl $2,%ecx # set MP
-2: movl %cr0,%eax
- andl $0x80000011,%eax # Save PG,PE,ET
- orl %ecx,%eax
- movl %eax,%cr0
-
- call check_x87
incb ready
- lgdt cpu_gdt_descr
- lidt idt_descr
- ljmp $(__KERNEL_CS),$1f
-1: movl $(__KERNEL_DS),%eax # reload all the segment registers
- movl %eax,%ss # after changing gdt.
-
- movl $(__USER_DS),%eax # DS/ES contains default USER segment
- movl %eax,%ds
- movl %eax,%es
xorl %eax,%eax # Clear FS/GS and LDT
movl %eax,%fs
movl %eax,%gs
- lldt %ax
cld # gcc2 wants the direction flag cleared at all times
+
#ifdef CONFIG_SMP
movb ready, %cl
cmpb $1,%cl
@@ -329,100 +93,18 @@ L6:
jmp L6 # main should never return here, but
# just in case, we know what happens.
-/*
- * We depend on ET to be correct. This checks for 287/387.
- */
-check_x87:
- movb $0,X86_HARD_MATH
- clts
- fninit
- fstsw %ax
- cmpb $0,%al
- je 1f
- movl %cr0,%eax /* no coprocessor: have to set bits */
- xorl $4,%eax /* set EM */
- movl %eax,%cr0
- ret
- ALIGN
-1: movb $1,X86_HARD_MATH
- .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
- ret
+ENTRY(lgdt_finish)
+ movl $(__KERNEL_DS),%eax # reload all the segment registers
+ movw %ax,%ss # after changing gdt.
-/*
- * setup_idt
- *
- * sets up a idt with 256 entries pointing to
- * ignore_int, interrupt gates. It doesn't actually load
- * idt - that can be done only after paging has been enabled
- * and the kernel moved to PAGE_OFFSET. Interrupts
- * are enabled elsewhere, when we can be relatively
- * sure everything is ok.
- *
- * Warning: %esi is live across this function.
- */
-setup_idt:
- lea ignore_int,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax /* selector = 0x0010 = cs */
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
-
- lea idt_table,%edi
- mov $256,%ecx
-rp_sidt:
- movl %eax,(%edi)
- movl %edx,4(%edi)
- addl $8,%edi
- dec %ecx
- jne rp_sidt
- ret
+ movl $(__USER_DS),%eax # DS/ES contains default USER segment
+ movw %ax,%ds
+ movw %ax,%es
-/* This is the default interrupt "handler" :-) */
- ALIGN
-ignore_int:
- cld
-#ifdef CONFIG_PRINTK
+ popl %eax # reload CS by intersegment return
+ pushl $(__KERNEL_CS)
pushl %eax
- pushl %ecx
- pushl %edx
- pushl %es
- pushl %ds
- movl $(__KERNEL_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- pushl 16(%esp)
- pushl 24(%esp)
- pushl 32(%esp)
- pushl 40(%esp)
- pushl $int_msg
- call printk
- addl $(5*4),%esp
- popl %ds
- popl %es
- popl %edx
- popl %ecx
- popl %eax
-#endif
- iret
-
-/*
- * Real beginning of normal "text" segment
- */
-ENTRY(stext)
-ENTRY(_stext)
-
-/*
- * BSS section
- */
-.section ".bss.page_aligned","w"
-ENTRY(swapper_pg_dir)
- .fill 1024,4,0
-ENTRY(empty_zero_page)
- .fill 4096,1,0
-
-/*
- * This starts the data section.
- */
-.data
+ lret
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE
@@ -430,27 +112,10 @@ ENTRY(stack_start)
ready: .byte 0
-int_msg:
- .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
-
-/*
- * The IDT and GDT 'descriptors' are a strange 48-bit object
- * only used by the lidt and lgdt instructions. They are not
- * like usual segment descriptors - they consist of a 16-bit
- * segment size, and 32-bit linear address value:
- */
-
-.globl boot_gdt_descr
.globl idt_descr
.globl cpu_gdt_descr
ALIGN
-# early boot GDT descriptor (must use 1:1 address mapping)
- .word 0 # 32 bit align gdt_desc.address
-boot_gdt_descr:
- .word __BOOT_DS+7
- .long boot_gdt_table - __PAGE_OFFSET
-
.word 0 # 32-bit align idt_desc.address
idt_descr:
.word IDT_ENTRIES*8-1 # idt contains 256 entries
@@ -459,25 +124,18 @@ idt_descr:
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
cpu_gdt_descr:
- .word GDT_ENTRIES*8-1
+ .word GDT_SIZE
.long cpu_gdt_table
.fill NR_CPUS-1,8,0 # space for the other GDT descriptors
-/*
- * The boot_gdt_table must mirror the equivalent in setup.S and is
- * used only for booting.
- */
- .align L1_CACHE_BYTES
-ENTRY(boot_gdt_table)
- .fill GDT_ENTRY_BOOT_CS,8,0
- .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
- .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
+.org 0x1000
+ENTRY(empty_zero_page)
-/*
- * The Global Descriptor Table contains 28 quadwords, per-CPU.
- */
- .align PAGE_SIZE_asm
+.org 0x2000
+ENTRY(swapper_pg_dir)
+
+.org 0x3000
ENTRY(cpu_gdt_table)
.quad 0x0000000000000000 /* NULL descriptor */
.quad 0x0000000000000000 /* 0x0b reserved */
@@ -492,32 +150,49 @@ ENTRY(cpu_gdt_table)
.quad 0x0000000000000000 /* 0x53 reserved */
.quad 0x0000000000000000 /* 0x5b reserved */
- .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
- .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
- .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
- .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
+#ifdef CONFIG_X86_PAE
+ .quad 0x00cfbb00000067ff /* 0x60 kernel 4GB code at 0x00000000 */
+ .quad 0x00cfb300000067ff /* 0x68 kernel 4GB data at 0x00000000 */
+ .quad 0x00cffb00000067ff /* 0x73 user 4GB code at 0x00000000 */
+ .quad 0x00cff300000067ff /* 0x7b user 4GB data at 0x00000000 */
+#else
+ .quad 0x00cfbb000000c3ff /* 0x60 kernel 4GB code at 0x00000000 */
+ .quad 0x00cfb3000000c3ff /* 0x68 kernel 4GB data at 0x00000000 */
+ .quad 0x00cffb000000c3ff /* 0x73 user 4GB code at 0x00000000 */
+ .quad 0x00cff3000000c3ff /* 0x7b user 4GB data at 0x00000000 */
+#endif
.quad 0x0000000000000000 /* 0x80 TSS descriptor */
.quad 0x0000000000000000 /* 0x88 LDT descriptor */
/* Segments used for calling PnP BIOS */
- .quad 0x00c09a0000000000 /* 0x90 32-bit code */
- .quad 0x00809a0000000000 /* 0x98 16-bit code */
- .quad 0x0080920000000000 /* 0xa0 16-bit data */
- .quad 0x0080920000000000 /* 0xa8 16-bit data */
- .quad 0x0080920000000000 /* 0xb0 16-bit data */
+ .quad 0x0000000000000000 /* 0x90 32-bit code */
+ .quad 0x0000000000000000 /* 0x98 16-bit code */
+ .quad 0x0000000000000000 /* 0xa0 16-bit data */
+ .quad 0x0000000000000000 /* 0xa8 16-bit data */
+ .quad 0x0000000000000000 /* 0xb0 16-bit data */
/*
* The APM segments have byte granularity and their bases
* and limits are set at run time.
*/
- .quad 0x00409a0000000000 /* 0xb8 APM CS code */
- .quad 0x00009a0000000000 /* 0xc0 APM CS 16 code (16 bit) */
- .quad 0x0040920000000000 /* 0xc8 APM DS data */
+ .quad 0x0000000000000000 /* 0xb8 APM CS code */
+ .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
+ .quad 0x0000000000000000 /* 0xc8 APM DS data */
- .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */
+ .quad 0x0000000000000000 /* 0xd0 - unused */
.quad 0x0000000000000000 /* 0xd8 - unused */
.quad 0x0000000000000000 /* 0xe0 - unused */
.quad 0x0000000000000000 /* 0xe8 - unused */
.quad 0x0000000000000000 /* 0xf0 - unused */
.quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
+ .fill GDT_ENTRIES-32,8,0
+.org 0x4000
+ENTRY(default_ldt)
+
+.org 0x5000
+/*
+ * Real beginning of normal "text" segment
+ */
+ENTRY(stext)
+ENTRY(_stext)
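
A note for anyone eyeballing the magic numbers in the cpu_gdt_table hunk
above: the access byte of the four flat segments changes from 9a/92/fa/f2
to bb/b3/fb/f3 (DPL 0 -> DPL 1, since the guest kernel now runs in ring 1
under Xen), and the limit field shrinks from 0xfffff pages so that flat
segments no longer reach the region Xen reserves at the top of the linear
address space. If you want to check the encoding yourself, here is a small
standalone decoder; it is a sanity-check utility, not part of the patch:

    /* desc.c - decode x86 segment descriptor quads (sanity check only).
     * Build: gcc -o desc desc.c
     */
    #include <stdio.h>
    #include <stdint.h>

    static void decode(uint64_t d)
    {
        uint32_t limit = (d & 0xffff) | ((d >> 32) & 0xf0000);
        uint32_t base  = ((d >> 16) & 0xffffff) | ((d >> 32) & 0xff000000);
        unsigned dpl   = (unsigned)((d >> 45) & 3);
        int      gran  = (int)((d >> 55) & 1);  /* G: limit is in 4K pages */
        uint64_t bytes = gran ? ((uint64_t)limit + 1) << 12
                              : (uint64_t)limit + 1;

        printf("%016llx: base=%08x dpl=%u size=%lluMB\n",
               (unsigned long long)d, base, dpl,
               (unsigned long long)(bytes >> 20));
    }

    int main(void)
    {
        decode(0x00cf9a000000ffffULL); /* native kernel code: dpl=0, 4096MB */
        decode(0x00cfbb000000c3ffULL); /* Xen kernel code:    dpl=1, 4036MB */
        return 0;
    }

The fixed .org offsets (empty_zero_page at 0x1000, swapper_pg_dir at
0x2000, cpu_gdt_table at 0x3000, default_ldt at 0x4000) make the early
boot layout explicit rather than linker-chosen, which is presumably what
the domain builder relies on.
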
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/i386_ksyms.c
linux-2.6-xen-sparse/arch/i386/kernel/i386_ksyms.c
--- pristine-linux-2.6.12/arch/i386/kernel/i386_ksyms.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/i386_ksyms.c 2005-07-28
13:17:07.000000000 -0700
@@ -76,7 +76,9 @@ EXPORT_SYMBOL(ioremap_nocache);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(kernel_thread);
EXPORT_SYMBOL(pm_idle);
+#ifdef CONFIG_ACPI_BOOT
EXPORT_SYMBOL(pm_power_off);
+#endif
EXPORT_SYMBOL(get_cmos_time);
EXPORT_SYMBOL(cpu_khz);
EXPORT_SYMBOL(apm_info);
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/io_apic.c
linux-2.6-xen-sparse/arch/i386/kernel/io_apic.c
--- pristine-linux-2.6.12/arch/i386/kernel/io_apic.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/io_apic.c 2005-07-28
13:17:07.000000000 -0700
@@ -42,6 +42,48 @@
#include "io_ports.h"
+#ifdef CONFIG_XEN
+
+#include <asm-xen/xen-public/xen.h>
+#include <asm-xen/xen-public/physdev.h>
+
+/* Fake i8259 */
+#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
+#define disable_8259A_irq(_irq) ((void)0)
+#define i8259A_irq_pending(_irq) (0)
+
+unsigned long io_apic_irqs;
+
+static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
+{
+ physdev_op_t op;
+ int ret;
+
+ op.cmd = PHYSDEVOP_APIC_READ;
+ op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid;
+ op.u.apic_op.offset = reg;
+ ret = HYPERVISOR_physdev_op(&op);
+ if (ret)
+ return ret;
+ return op.u.apic_op.value;
+}
+
+static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+ physdev_op_t op;
+
+ op.cmd = PHYSDEVOP_APIC_WRITE;
+ op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid;
+ op.u.apic_op.offset = reg;
+ op.u.apic_op.value = value;
+ HYPERVISOR_physdev_op(&op);
+}
+
+#define io_apic_read(a,r) xen_io_apic_read(a,r)
+#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
+
+#endif /* CONFIG_XEN */
+
int (*ioapic_renumber_irq)(int ioapic, int irq);
atomic_t irq_mis_count;
@@ -107,6 +149,7 @@ static void add_pin_to_irq(unsigned int
entry->pin = pin;
}
+#ifndef CONFIG_XEN
/*
* Reroute an IRQ to a different pin.
*/
@@ -243,6 +286,9 @@ static void set_ioapic_affinity_irq(unsi
}
spin_unlock_irqrestore(&ioapic_lock, flags);
}
+#else
+#define clear_IO_APIC() ((void)0)
+#endif
#if defined(CONFIG_IRQBALANCE)
# include <asm/processor.h> /* kernel_thread() */
@@ -664,6 +710,7 @@ static inline void move_irq(int irq) { }
#ifndef CONFIG_SMP
void fastcall send_IPI_self(int vector)
{
+#ifndef CONFIG_XEN
unsigned int cfg;
/*
@@ -675,6 +722,7 @@ void fastcall send_IPI_self(int vector)
* Send the IPI. The write to APIC_ICR fires this off.
*/
apic_write_around(APIC_ICR, cfg);
+#endif
}
#endif /* !CONFIG_SMP */
@@ -744,6 +792,7 @@ static int find_irq_entry(int apic, int
return -1;
}
+#ifndef CONFIG_XEN
/*
* Find the pin to which IRQ[irq] (ISA) is connected
*/
@@ -766,6 +815,7 @@ static int find_isa_irq_pin(int irq, int
}
return -1;
}
+#endif
/*
* Find a specific PCI IRQ entry.
@@ -813,6 +863,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
return best_guess;
}
+#ifndef CONFIG_XEN
/*
* This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
@@ -836,6 +887,7 @@ void __init setup_ioapic_dest(void)
}
}
+#endif /* !CONFIG_XEN */
/*
* EISA Edge/Level control register, ELCR
@@ -1125,26 +1177,22 @@ static inline int IO_APIC_irq_trigger(in
}
/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
+u8 irq_vector[NR_IRQ_VECTORS]; /* = { FIRST_DEVICE_VECTOR , 0 }; */
int assign_irq_vector(int irq)
{
- static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
+ static int current_vector = FIRST_DEVICE_VECTOR;
+ physdev_op_t op;
BUG_ON(irq >= NR_IRQ_VECTORS);
if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
return IO_APIC_VECTOR(irq);
-next:
- current_vector += 8;
- if (current_vector == SYSCALL_VECTOR)
- goto next;
-
- if (current_vector >= FIRST_SYSTEM_VECTOR) {
- offset++;
- if (!(offset%8))
- return -ENOSPC;
- current_vector = FIRST_DEVICE_VECTOR + offset;
- }
+
+ op.cmd = PHYSDEVOP_ASSIGN_VECTOR;
+ op.u.irq_op.irq = irq;
+ if (HYPERVISOR_physdev_op(&op))
+ return -ENOSPC;
+ current_vector = op.u.irq_op.vector;
vector_irq[current_vector] = irq;
if (irq != AUTO_ASSIGN)
@@ -1153,6 +1201,7 @@ next:
return current_vector;
}
+#ifndef CONFIG_XEN
static struct hw_interrupt_type ioapic_level_type;
static struct hw_interrupt_type ioapic_edge_type;
@@ -1178,6 +1227,9 @@ static inline void ioapic_register_intr(
set_intr_gate(vector, interrupt[irq]);
}
}
+#else
+#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
+#endif
static void __init setup_IO_APIC_irqs(void)
{
@@ -1233,7 +1285,7 @@ static void __init setup_IO_APIC_irqs(vo
else
add_pin_to_irq(irq, apic, pin);
- if (!apic && !IO_APIC_IRQ(irq))
+ if (/*!apic &&*/ !IO_APIC_IRQ(irq))
continue;
if (IO_APIC_IRQ(irq)) {
@@ -1258,6 +1310,7 @@ static void __init setup_IO_APIC_irqs(vo
/*
* Set up the 8259A-master output pin:
*/
+#ifndef CONFIG_XEN
static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
{
struct IO_APIC_route_entry entry;
@@ -1452,8 +1505,6 @@ void __init print_IO_APIC(void)
return;
}
-#if 0
-
static void print_APIC_bitfield (int base)
{
unsigned int v;
@@ -1595,8 +1646,9 @@ void /*__init*/ print_PIC(void)
v = inb(0x4d1) << 8 | inb(0x4d0);
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
-
-#endif /* 0 */
+#else
+void __init print_IO_APIC(void) { }
+#endif /* !CONFIG_XEN */
static void __init enable_IO_APIC(void)
{
@@ -1638,7 +1690,9 @@ void disable_IO_APIC(void)
*/
clear_IO_APIC();
+#ifndef CONFIG_XEN
disconnect_bsp_APIC();
+#endif
}
/*
@@ -1648,7 +1702,7 @@ void disable_IO_APIC(void)
* by Matt Domsch <Matt_Domsch@xxxxxxxx> Tue Dec 21 12:25:05 CST 1999
*/
-#ifndef CONFIG_X86_NUMAQ
+#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
static void __init setup_ioapic_ids_from_mpc(void)
{
union IO_APIC_reg_00 reg_00;
@@ -1755,6 +1809,7 @@ static void __init setup_ioapic_ids_from
static void __init setup_ioapic_ids_from_mpc(void) { }
#endif
+#ifndef CONFIG_XEN
/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
@@ -1979,6 +2034,7 @@ static struct hw_interrupt_type ioapic_l
.end = end_level_ioapic,
.set_affinity = set_ioapic_affinity,
};
+#endif /* !CONFIG_XEN */
static inline void init_IO_APIC_traps(void)
{
@@ -2010,13 +2066,16 @@ static inline void init_IO_APIC_traps(vo
*/
if (irq < 16)
make_8259A_irq(irq);
+#ifndef CONFIG_XEN
else
/* Strange. Oh, well.. */
irq_desc[irq].handler = &no_irq_type;
+#endif
}
}
}
+#ifndef CONFIG_XEN
static void enable_lapic_irq (unsigned int irq)
{
unsigned long v;
@@ -2243,6 +2302,9 @@ static inline void check_timer(void)
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option");
}
+#else
+#define check_timer() ((void)0)
+#endif
/*
*
@@ -2269,7 +2331,9 @@ void __init setup_IO_APIC(void)
*/
if (!acpi_ioapic)
setup_ioapic_ids_from_mpc();
+#ifndef CONFIG_XEN
sync_Arb_IDs();
+#endif
setup_IO_APIC_irqs();
init_IO_APIC_traps();
check_timer();
@@ -2391,6 +2455,7 @@ device_initcall(ioapic_init_sysfs);
int __init io_apic_get_unique_id (int ioapic, int apic_id)
{
+#ifndef CONFIG_XEN
union IO_APIC_reg_00 reg_00;
static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
physid_mask_t tmp;
@@ -2457,6 +2522,7 @@ int __init io_apic_get_unique_id (int io
apic_printk(APIC_VERBOSE, KERN_INFO
"IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+#endif /* !CONFIG_XEN */
return apic_id;
}
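
The recurring pattern in this file is worth calling out: every direct APIC
register access becomes "fill in a physdev_op_t, call
HYPERVISOR_physdev_op(), read the result back out of the union", with the
hypervisor doing the actual MMIO. (One wart in passing: on failure
xen_io_apic_read() hands the negative error code back through its unsigned
return value, so callers can't tell it from register contents.) As a
sketch of how a read-modify-write looks in this scheme, using only the
interfaces visible above; illustrative, not code from the tree:

    #include <asm-xen/xen-public/physdev.h>

    /* Read-modify-write one IO-APIC register via the hypervisor proxy.
     * apic_id is the mpc_apicid, exactly as in xen_io_apic_read() above. */
    static int ioapic_rmw(unsigned int apic_id, unsigned int reg,
                          unsigned int clear, unsigned int set)
    {
        physdev_op_t op;
        int ret;

        op.cmd = PHYSDEVOP_APIC_READ;
        op.u.apic_op.apic = apic_id;
        op.u.apic_op.offset = reg;
        ret = HYPERVISOR_physdev_op(&op);
        if (ret)
            return ret;                   /* hypervisor refused the access */

        /* apic/offset are still set up from the read */
        op.cmd = PHYSDEVOP_APIC_WRITE;
        op.u.apic_op.value = (op.u.apic_op.value & ~clear) | set;
        return HYPERVISOR_physdev_op(&op);
    }
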
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/ioport.c
linux-2.6-xen-sparse/arch/i386/kernel/ioport.c
--- pristine-linux-2.6.12/arch/i386/kernel/ioport.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/ioport.c 2005-07-28
13:17:07.000000000 -0700
@@ -15,6 +15,7 @@
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/thread_info.h>
+#include <asm-xen/xen-public/physdev.h>
/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
@@ -56,10 +57,9 @@ static void set_bitmap(unsigned long *bi
*/
asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
- unsigned long i, max_long, bytes, bytes_updated;
struct thread_struct * t = &current->thread;
- struct tss_struct * tss;
unsigned long *bitmap;
+ physdev_op_t op;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
return -EINVAL;
@@ -78,41 +78,15 @@ asmlinkage long sys_ioperm(unsigned long
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
- }
- /*
- * do it in the per-thread copy and in the TSS ...
- *
- * Disable preemption via get_cpu() - we must not switch away
- * because the ->io_bitmap_max value must match the bitmap
- * contents:
- */
- tss = &per_cpu(init_tss, get_cpu());
+ op.cmd = PHYSDEVOP_SET_IOBITMAP;
+ op.u.set_iobitmap.bitmap = (unsigned long)bitmap;
+ op.u.set_iobitmap.nr_ports = IO_BITMAP_BITS;
+ HYPERVISOR_physdev_op(&op);
+ }
set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
- /*
- * Search for a (possibly new) maximum. This is simple and stupid,
- * to keep it obviously correct:
- */
- max_long = 0;
- for (i = 0; i < IO_BITMAP_LONGS; i++)
- if (t->io_bitmap_ptr[i] != ~0UL)
- max_long = i;
-
- bytes = (max_long + 1) * sizeof(long);
- bytes_updated = max(bytes, t->io_bitmap_max);
-
- t->io_bitmap_max = bytes;
-
- /*
- * Sets the lazy trigger so that the next I/O operation will
- * reload the correct bitmap.
- */
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
-
- put_cpu();
-
return 0;
}
@@ -127,21 +101,29 @@ asmlinkage long sys_ioperm(unsigned long
* code.
*/
-asmlinkage long sys_iopl(unsigned long unused)
+asmlinkage long sys_iopl(unsigned int new_io_pl)
{
- volatile struct pt_regs * regs = (struct pt_regs *) &unused;
- unsigned int level = regs->ebx;
- unsigned int old = (regs->eflags >> 12) & 3;
+ unsigned int old_io_pl = current->thread.io_pl;
+ physdev_op_t op;
- if (level > 3)
+ if (new_io_pl > 3)
return -EINVAL;
- /* Trying to gain more privileges? */
- if (level > old) {
- if (!capable(CAP_SYS_RAWIO))
- return -EPERM;
- }
- regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
- /* Make sure we return the long way (not sysenter) */
- set_thread_flag(TIF_IRET);
+
+ /* Need "raw I/O" privileges for direct port access. */
+ if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ /* Maintain OS privileges even if user attempts to relinquish them. */
+ if (new_io_pl == 0)
+ new_io_pl = 1;
+
+ /* Change our version of the privilege levels. */
+ current->thread.io_pl = new_io_pl;
+
+ /* Force the change at ring 0. */
+ op.cmd = PHYSDEVOP_SET_IOPL;
+ op.u.set_iopl.iopl = new_io_pl;
+ HYPERVISOR_physdev_op(&op);
+
return 0;
}
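
The net effect of the ioport.c changes: the TSS bitmap juggling is gone,
and both ioperm() and iopl() become per-thread state that the hypervisor
enforces (with iopl(0) quietly promoted to 1 so the kernel side keeps its
port access). The user-visible API is unchanged, so the usual userspace
exercise still works; minimal example, assuming an x86 glibc, run as root:

    /* port.c - drive the Xen-mediated ioperm/iopl paths.
     * Build: gcc -O2 -o port port.c
     */
    #include <stdio.h>
    #include <sys/io.h>

    int main(void)
    {
        /* sys_ioperm -> PHYSDEVOP_SET_IOBITMAP: RTC index/data ports */
        if (ioperm(0x70, 2, 1)) {
            perror("ioperm");
            return 1;
        }
        outb(0x00, 0x70);                /* select RTC seconds register */
        printf("RTC seconds: %02x\n", inb(0x71));

        /* sys_iopl -> PHYSDEVOP_SET_IOPL: all ports at once */
        if (iopl(3))
            perror("iopl");
        return 0;
    }
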
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/irq.c
linux-2.6-xen-sparse/arch/i386/kernel/irq.c
--- pristine-linux-2.6.12/arch/i386/kernel/irq.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/irq.c 2005-07-28 13:17:07.000000000
-0700
@@ -15,6 +15,9 @@
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp;
EXPORT_PER_CPU_SYMBOL(irq_stat);
@@ -51,7 +54,7 @@ static union irq_ctx *softirq_ctx[NR_CPU
fastcall unsigned int do_IRQ(struct pt_regs *regs)
{
/* high bits used in ret_from_ code */
- int irq = regs->orig_eax & 0xff;
+ int irq = regs->orig_eax & __IRQ_MASK(HARDIRQ_BITS);
#ifdef CONFIG_4KSTACKS
union irq_ctx *curctx, *irqctx;
u32 *isp;
@@ -210,9 +213,8 @@ int show_interrupts(struct seq_file *p,
if (i == 0) {
seq_printf(p, " ");
- for (j=0; j<NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "CPU%d ",j);
+ for_each_cpu(j)
+ seq_printf(p, "CPU%d ",j);
seq_putc(p, '\n');
}
@@ -225,9 +227,8 @@ int show_interrupts(struct seq_file *p,
#ifndef CONFIG_SMP
seq_printf(p, "%10u ", kstat_irqs(i));
#else
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ for_each_cpu(j)
+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
#endif
seq_printf(p, " %14s", irq_desc[i].handler->typename);
seq_printf(p, " %s", action->name);
@@ -240,16 +241,13 @@ skip:
spin_unlock_irqrestore(&irq_desc[i].lock, flags);
} else if (i == NR_IRQS) {
seq_printf(p, "NMI: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", nmi_count(j));
+ for_each_cpu(j)
+ seq_printf(p, "%10u ", nmi_count(j));
seq_putc(p, '\n');
#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).apic_timer_irqs);
+ for_each_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(irq_stat,
j).apic_timer_irqs);
seq_putc(p, '\n');
#endif
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
@@ -259,3 +257,43 @@ skip:
}
return 0;
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+void fixup_irqs(cpumask_t map)
+{
+ unsigned int irq;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ cpumask_t mask;
+ if (irq == 2)
+ continue;
+
+ cpus_and(mask, irq_affinity[irq], map);
+ if (any_online_cpu(mask) == NR_CPUS) {
+ printk("Breaking affinity for irq %i\n", irq);
+ mask = map;
+ }
+ if (irq_desc[irq].handler->set_affinity)
+ irq_desc[irq].handler->set_affinity(irq, mask);
+ else if (irq_desc[irq].action)
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+#if 0
+ barrier();
+ /* Ingo Molnar says: "after the IO-APIC masks have been redirected
+ [note the nop - the interrupt-enable boundary on x86 is two
+ instructions from sti] - to flush out pending hardirqs and
+ IPIs. After this point nothing is supposed to reach this CPU." */
+ __asm__ __volatile__("sti; nop; cli");
+ barrier();
+#else
+ /* That doesn't seem sufficient. Give it 1ms. */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+#endif
+}
+#endif
+
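
Two small things in the irq.c hunk deserve a comment. The do_IRQ() change
widens the IRQ number extracted from orig_eax: 0xff caps it at 255, which
is too small once dynamically-bound event channels come into play, while
__IRQ_MASK(HARDIRQ_BITS) tracks whatever the kernel was configured for.
A quick demonstration (the macro body and the HARDIRQ_BITS value of 12 are
copied from my reading of the 2.6.12-era i386 headers):

    /* mask.c - why masking with 0xff loses high IRQ numbers. */
    #include <stdio.h>

    #define HARDIRQ_BITS 12                  /* i386 default in this era */
    #define __IRQ_MASK(x) ((1UL << (x)) - 1)

    int main(void)
    {
        unsigned long orig_eax = 0x123;      /* event-channel irq 291 */

        printf("old: irq=%lu\n", orig_eax & 0xff);                     /* 35 */
        printf("new: irq=%lu\n", orig_eax & __IRQ_MASK(HARDIRQ_BITS)); /* 291 */
        return 0;
    }

The for_each_cpu() conversions in show_interrupts() are a straight cleanup
of the open-coded NR_CPUS/cpu_online() loops, and fixup_irqs() is the
CPU-hotplug helper that migrates interrupt affinity off a departing CPU.
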
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/ldt.c
linux-2.6-xen-sparse/arch/i386/kernel/ldt.c
--- pristine-linux-2.6.12/arch/i386/kernel/ldt.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/ldt.c 2005-07-28 13:17:07.000000000
-0700
@@ -18,6 +18,7 @@
#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
+#include <asm/mmu_context.h>
#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
static void flush_ldt(void *null)
@@ -58,16 +59,20 @@ static int alloc_ldt(mm_context_t *pc, i
#ifdef CONFIG_SMP
cpumask_t mask;
preempt_disable();
+#endif
+ make_pages_readonly(pc->ldt, (pc->size * LDT_ENTRY_SIZE) /
+ PAGE_SIZE);
load_LDT(pc);
+#ifdef CONFIG_SMP
mask = cpumask_of_cpu(smp_processor_id());
if (!cpus_equal(current->mm->cpu_vm_mask, mask))
smp_call_function(flush_ldt, NULL, 1, 1);
preempt_enable();
-#else
- load_LDT(pc);
#endif
}
if (oldsize) {
+ make_pages_writable(oldldt, (oldsize * LDT_ENTRY_SIZE) /
+ PAGE_SIZE);
if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(oldldt);
else
@@ -82,6 +87,8 @@ static inline int copy_ldt(mm_context_t
if (err < 0)
return err;
memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+ make_pages_readonly(new->ldt, (new->size * LDT_ENTRY_SIZE) /
+ PAGE_SIZE);
return 0;
}
@@ -94,14 +101,19 @@ int init_new_context(struct task_struct
struct mm_struct * old_mm;
int retval = 0;
+ memset(&mm->context, 0, sizeof(mm->context));
init_MUTEX(&mm->context.sem);
- mm->context.size = 0;
old_mm = current->mm;
if (old_mm && old_mm->context.size > 0) {
down(&old_mm->context.sem);
retval = copy_ldt(&mm->context, &old_mm->context);
up(&old_mm->context.sem);
}
+ if (retval == 0) {
+ spin_lock(&mm_unpinned_lock);
+ list_add(&mm->context.unpinned, &mm_unpinned);
+ spin_unlock(&mm_unpinned_lock);
+ }
return retval;
}
@@ -113,12 +125,20 @@ void destroy_context(struct mm_struct *m
if (mm->context.size) {
if (mm == current->active_mm)
clear_LDT();
+ make_pages_writable(mm->context.ldt,
+ (mm->context.size * LDT_ENTRY_SIZE) /
+ PAGE_SIZE);
if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(mm->context.ldt);
else
kfree(mm->context.ldt);
mm->context.size = 0;
}
+ if (!mm->context.pinned) {
+ spin_lock(&mm_unpinned_lock);
+ list_del(&mm->context.unpinned);
+ spin_unlock(&mm_unpinned_lock);
+ }
}
static int read_ldt(void __user * ptr, unsigned long bytecount)
@@ -178,6 +198,7 @@ static int write_ldt(void __user * ptr,
{
struct mm_struct * mm = current->mm;
__u32 entry_1, entry_2, *lp;
+ unsigned long mach_lp;
int error;
struct user_desc ldt_info;
@@ -206,6 +227,7 @@ static int write_ldt(void __user * ptr,
}
lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
+ mach_lp = arbitrary_virt_to_machine(lp);
/* Allow LDTs to be cleared by the user. */
if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
@@ -223,9 +245,7 @@ static int write_ldt(void __user * ptr,
/* Install the new entry ... */
install:
- *lp = entry_1;
- *(lp+1) = entry_2;
- error = 0;
+ error = HYPERVISOR_update_descriptor(mach_lp, entry_1, entry_2);
out_unlock:
up(&mm->context.sem);
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/Makefile
linux-2.6-xen-sparse/arch/i386/kernel/Makefile
--- pristine-linux-2.6.12/arch/i386/kernel/Makefile 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/Makefile 2005-07-28
13:17:07.000000000 -0700
@@ -2,41 +2,52 @@
# Makefile for the linux kernel.
#
-extra-y := head.o init_task.o vmlinux.lds
+XENARCH := $(subst ",,$(CONFIG_XENARCH))
-obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
- ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
- pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
- doublefault.o quirks.o
+CFLAGS += -Iarch/$(XENARCH)/kernel
+
+extra-y := head.o init_task.o
+
+obj-y := process.o signal.o entry.o traps.o \
+ time.o ioport.o ldt.o setup.o \
+ pci-dma.o i386_ksyms.o irq.o quirks.o
+
+c-obj-y := semaphore.o vm86.o \
+ ptrace.o sys_i386.o \
+ i387.o dmi_scan.o bootflag.o \
+ doublefault.o
+s-obj-y :=
obj-y += cpu/
-obj-y += timers/
+#obj-y += timers/
obj-$(CONFIG_ACPI_BOOT) += acpi/
-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
-obj-$(CONFIG_MCA) += mca.o
-obj-$(CONFIG_X86_MSR) += msr.o
-obj-$(CONFIG_X86_CPUID) += cpuid.o
+#c-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
+c-obj-$(CONFIG_MCA) += mca.o
+c-obj-$(CONFIG_X86_MSR) += msr.o
+c-obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_MICROCODE) += microcode.o
-obj-$(CONFIG_APM) += apm.o
+c-obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_X86_SMP) += smp.o smpboot.o
-obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
+#obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o
+c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
-obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o
-obj-$(CONFIG_X86_NUMAQ) += numaq.o
-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o
-obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_MODULES) += module.o
-obj-y += sysenter.o vsyscall.o
-obj-$(CONFIG_ACPI_SRAT) += srat.o
-obj-$(CONFIG_HPET_TIMER) += time_hpet.o
-obj-$(CONFIG_EFI) += efi.o efi_stub.o
-obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+c-obj-$(CONFIG_X86_REBOOTFIXUPS)+= reboot_fixups.o
+c-obj-$(CONFIG_X86_NUMAQ) += numaq.o
+c-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o
+c-obj-$(CONFIG_MODULES) += module.o
+c-obj-y += sysenter.o
+obj-y += vsyscall.o
+c-obj-$(CONFIG_ACPI_SRAT) += srat.o
+c-obj-$(CONFIG_HPET_TIMER) += time_hpet.o
+c-obj-$(CONFIG_EFI) += efi.o efi_stub.o
+c-obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+c-obj-$(CONFIG_SMP_ALTERNATIVES)+= smpalts.o
EXTRA_AFLAGS := -traditional
-obj-$(CONFIG_SCx200) += scx200.o
+c-obj-$(CONFIG_SCx200) += scx200.o
# vsyscall.o contains the vsyscall DSO images as __initdata.
# We must build both images before we can assemble it.
@@ -58,7 +69,7 @@ SYSCFLAGS_vsyscall-int80.so = $(vsyscall
$(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
$(obj)/vsyscall-%.so: $(src)/vsyscall.lds \
- $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE
+ $(obj)/vsyscall-%.o FORCE
$(call if_changed,syscall)
# We also create a special relocatable object that should mirror the symbol
@@ -70,5 +81,21 @@ $(obj)/built-in.o: ld_flags += -R $(obj)
SYSCFLAGS_vsyscall-syms.o = -r
$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
- $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
+ $(obj)/vsyscall-sysenter.o FORCE
$(call if_changed,syscall)
+
+c-link := init_task.o
+s-link := vsyscall-int80.o vsyscall-sysenter.o vsyscall-sigreturn.o vsyscall.lds.o syscall_table.o
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-obj-m) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)):
+ @ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $@
+
+$(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S
+
+$(obj)/entry.o: $(src)/entry.S $(src)/syscall_table.S
+
+obj-y += $(c-obj-y) $(s-obj-y)
+obj-m += $(c-obj-m)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-m) $(c-obj-) $(c-link))
+clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/microcode.c
linux-2.6-xen-sparse/arch/i386/kernel/microcode.c
--- pristine-linux-2.6.12/arch/i386/kernel/microcode.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/microcode.c 2005-07-28
13:17:07.000000000 -0700
@@ -18,55 +18,6 @@
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
- *
- * 1.0 16 Feb 2000, Tigran Aivazian <tigran@xxxxxxx>
- * Initial release.
- * 1.01 18 Feb 2000, Tigran Aivazian <tigran@xxxxxxx>
- * Added read() support + cleanups.
- * 1.02 21 Feb 2000, Tigran Aivazian <tigran@xxxxxxx>
- * Added 'device trimming' support. open(O_WRONLY) zeroes
- * and frees the saved copy of applied microcode.
- * 1.03 29 Feb 2000, Tigran Aivazian <tigran@xxxxxxx>
- * Made to use devfs (/dev/cpu/microcode) + cleanups.
- * 1.04 06 Jun 2000, Simon Trimmer <simon@xxxxxxxxxxx>
- * Added misc device support (now uses both devfs and misc).
- * Added MICROCODE_IOCFREE ioctl to clear memory.
- * 1.05 09 Jun 2000, Simon Trimmer <simon@xxxxxxxxxxx>
- * Messages for error cases (non Intel & no suitable microcode).
- * 1.06 03 Aug 2000, Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Removed ->release(). Removed exclusive open and status bitmap.
- * Added microcode_rwsem to serialize read()/write()/ioctl().
- * Removed global kernel lock usage.
- * 1.07 07 Sep 2000, Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Write 0 to 0x8B msr and then cpuid before reading revision,
- * so that it works even if there were no update done by the
- * BIOS. Otherwise, reading from 0x8B gives junk (which happened
- * to be 0 on my machine which is why it worked even when I
- * disabled update by the BIOS)
- * Thanks to Eric W. Biederman <ebiederman@xxxxxxxx> for the fix.
- * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@xxxxxxxxx> and
- * Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Intel Pentium 4 processor support and bugfixes.
- * 1.09 30 Oct 2001, Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Bugfix for HT (Hyper-Threading) enabled processors
- * whereby processor resources are shared by all logical processors
- * in a single CPU package.
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@xxxxxxxxx> and
- * Tigran Aivazian <tigran@xxxxxxxxxxx>,
- * Serialize updates as required on HT processors due to
speculative
- * nature of implementation.
- * 1.11 22 Mar 2002 Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@xxxxxxxxx>,
- * Jun Nakajima <jun.nakajima@xxxxxxxxx>
- * Support for the microcode updates in the new format.
- * 1.13 10 Oct 2003 Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
- * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Fix sigmatch() macro to handle old CPUs with pf == 0.
- * Thanks to Stuart Swales for pointing out this bug.
*/
//#define DEBUG /* pr_debug */
@@ -79,6 +30,7 @@
#include <linux/miscdevice.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
+#include <linux/syscalls.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
@@ -88,342 +40,41 @@ MODULE_DESCRIPTION("Intel CPU (IA-32) Mi
MODULE_AUTHOR("Tigran Aivazian <tigran@xxxxxxxxxxx>");
MODULE_LICENSE("GPL");
-#define MICROCODE_VERSION "1.14"
+#define MICROCODE_VERSION "1.14-xen"
#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
-#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
-#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
-#define DWSIZE (sizeof (u32))
-#define get_totalsize(mc) \
- (((microcode_t *)mc)->hdr.totalsize ? \
- ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
-#define get_datasize(mc) \
- (((microcode_t *)mc)->hdr.datasize ? \
- ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
- (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
static DECLARE_MUTEX(microcode_sem);
static void __user *user_buffer; /* user area microcode data buffer */
static unsigned int user_buffer_size; /* its size */
-
-typedef enum mc_error_code {
- MC_SUCCESS = 0,
- MC_NOTFOUND = 1,
- MC_MARKED = 2,
- MC_ALLOCATED = 3,
-} mc_error_code_t;
-
-static struct ucode_cpu_info {
- unsigned int sig;
- unsigned int pf;
- unsigned int rev;
- unsigned int cksum;
- mc_error_code_t err;
- microcode_t *mc;
-} ucode_cpu_info[NR_CPUS];
static int microcode_open (struct inode *unused1, struct file *unused2)
{
return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
}
-static void collect_cpu_info (void *unused)
-{
- int cpu_num = smp_processor_id();
- struct cpuinfo_x86 *c = cpu_data + cpu_num;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- unsigned int val[2];
-
- uci->sig = uci->pf = uci->rev = uci->cksum = 0;
- uci->err = MC_NOTFOUND;
- uci->mc = NULL;
-
- if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
- cpu_has(c, X86_FEATURE_IA64)) {
- printk(KERN_ERR "microcode: CPU%d not a capable Intel
processor\n", cpu_num);
- return;
- } else {
- uci->sig = cpuid_eax(0x00000001);
-
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- uci->pf = 1 << ((val[1] >> 18) & 7);
- }
- }
-
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
- pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
- uci->sig, uci->pf, uci->rev);
-}
-
-static inline void mark_microcode_update (int cpu_num, microcode_header_t
*mc_header, int sig, int pf, int cksum)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- pr_debug("Microcode Found.\n");
- pr_debug(" Header Revision 0x%x\n", mc_header->hdrver);
- pr_debug(" Loader Revision 0x%x\n", mc_header->ldrver);
- pr_debug(" Revision 0x%x \n", mc_header->rev);
- pr_debug(" Date %x/%x/%x\n",
- ((mc_header->date >> 24 ) & 0xff),
- ((mc_header->date >> 16 ) & 0xff),
- (mc_header->date & 0xFFFF));
- pr_debug(" Signature 0x%x\n", sig);
- pr_debug(" Type 0x%x Family 0x%x Model 0x%x Stepping 0x%x\n",
- ((sig >> 12) & 0x3),
- ((sig >> 8) & 0xf),
- ((sig >> 4) & 0xf),
- ((sig & 0xf)));
- pr_debug(" Processor Flags 0x%x\n", pf);
- pr_debug(" Checksum 0x%x\n", cksum);
-
- if (mc_header->rev < uci->rev) {
- printk(KERN_ERR "microcode: CPU%d not 'upgrading' to earlier
revision"
- " 0x%x (current=0x%x)\n", cpu_num, mc_header->rev,
uci->rev);
- goto out;
- } else if (mc_header->rev == uci->rev) {
- /* notify the caller of success on this cpu */
- uci->err = MC_SUCCESS;
- printk(KERN_ERR "microcode: CPU%d already at revision"
- " 0x%x (current=0x%x)\n", cpu_num, mc_header->rev,
uci->rev);
- goto out;
- }
-
- pr_debug("microcode: CPU%d found a matching microcode update with "
- " revision 0x%x (current=0x%x)\n", cpu_num, mc_header->rev,
uci->rev);
- uci->cksum = cksum;
- uci->pf = pf; /* keep the original mc pf for cksum calculation */
- uci->err = MC_MARKED; /* found the match */
-out:
- return;
-}
-
-static int find_matching_ucodes (void)
-{
- int cursor = 0;
- int error = 0;
-
- while (cursor + MC_HEADER_SIZE < user_buffer_size) {
- microcode_header_t mc_header;
- void *newmc = NULL;
- int i, sum, cpu_num, allocated_flag, total_size, data_size,
ext_table_size;
-
- if (copy_from_user(&mc_header, user_buffer + cursor,
MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user
data\n");
- error = -EFAULT;
- goto out;
- }
-
- total_size = get_totalsize(&mc_header);
- if ((cursor + total_size > user_buffer_size) || (total_size <
DEFAULT_UCODE_TOTALSIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in
microcode data file\n");
- error = -EINVAL;
- goto out;
- }
-
- data_size = get_datasize(&mc_header);
- if ((data_size + MC_HEADER_SIZE > total_size) || (data_size <
DEFAULT_UCODE_DATASIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in
microcode data file\n");
- error = -EINVAL;
- goto out;
- }
-
- if (mc_header.ldrver != 1 || mc_header.hdrver != 1) {
- printk(KERN_ERR "microcode: error! Unknown microcode
update format\n");
- error = -EINVAL;
- goto out;
- }
-
- for (cpu_num = 0; cpu_num < num_online_cpus(); cpu_num++) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- if (uci->err != MC_NOTFOUND) /* already found a match
or not an online cpu*/
- continue;
-
- if (sigmatch(mc_header.sig, uci->sig, mc_header.pf,
uci->pf))
- mark_microcode_update(cpu_num, &mc_header,
mc_header.sig, mc_header.pf, mc_header.cksum);
- }
-
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- struct extended_sigtable ext_header;
- struct extended_signature ext_sig;
- int ext_sigcount;
-
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE)
% EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in
microcode data file\n");
- error = -EINVAL;
- goto out;
- }
- if (copy_from_user(&ext_header, user_buffer + cursor
- + MC_HEADER_SIZE + data_size,
EXT_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read
user data\n");
- error = -EFAULT;
- goto out;
- }
- if (ext_table_size != exttable_size(&ext_header)) {
- printk(KERN_ERR "microcode: error! Bad data in
microcode data file\n");
- error = -EFAULT;
- goto out;
- }
-
- ext_sigcount = ext_header.count;
-
- for (i = 0; i < ext_sigcount; i++) {
- if (copy_from_user(&ext_sig, user_buffer +
cursor + MC_HEADER_SIZE + data_size + EXT_HEADER_SIZE
- + EXT_SIGNATURE_SIZE * i,
EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! Can
not read user data\n");
- error = -EFAULT;
- goto out;
- }
- for (cpu_num = 0; cpu_num < num_online_cpus();
cpu_num++) {
- struct ucode_cpu_info *uci =
ucode_cpu_info + cpu_num;
- if (uci->err != MC_NOTFOUND) /* already
found a match or not an online cpu*/
- continue;
- if (sigmatch(ext_sig.sig, uci->sig,
ext_sig.pf, uci->pf)) {
- mark_microcode_update(cpu_num,
&mc_header, ext_sig.sig, ext_sig.pf, ext_sig.cksum);
- }
- }
- }
- }
- /* now check if any cpu has matched */
- for (cpu_num = 0, allocated_flag = 0, sum = 0; cpu_num <
num_online_cpus(); cpu_num++) {
- if (ucode_cpu_info[cpu_num].err == MC_MARKED) {
- struct ucode_cpu_info *uci = ucode_cpu_info +
cpu_num;
- if (!allocated_flag) {
- allocated_flag = 1;
- newmc = vmalloc(total_size);
- if (!newmc) {
- printk(KERN_ERR "microcode:
error! Can not allocate memory\n");
- error = -ENOMEM;
- goto out;
- }
- if (copy_from_user(newmc +
MC_HEADER_SIZE,
- user_buffer +
cursor + MC_HEADER_SIZE,
- total_size -
MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode:
error! Can not read user data\n");
- vfree(newmc);
- error = -EFAULT;
- goto out;
- }
- memcpy(newmc, &mc_header,
MC_HEADER_SIZE);
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int * ext_tablep = (((void *)
newmc) + MC_HEADER_SIZE + data_size);
- i = ext_table_size / DWSIZE;
- while (i--) ext_table_sum +=
ext_tablep[i];
- if (ext_table_sum) {
- printk(KERN_WARNING
"microcode: aborting, bad extended signature table checksum\n");
- vfree(newmc);
- error = -EINVAL;
- goto out;
- }
- }
-
- /* calculate the checksum */
- i = (MC_HEADER_SIZE + data_size) /
DWSIZE;
- while (i--) sum += ((int *)newmc)[i];
- sum -= (mc_header.sig + mc_header.pf +
mc_header.cksum);
- }
- ucode_cpu_info[cpu_num].mc = newmc;
- ucode_cpu_info[cpu_num].err = MC_ALLOCATED; /*
mc updated */
- if (sum + uci->sig + uci->pf + uci->cksum != 0)
{
- printk(KERN_ERR "microcode: CPU%d
aborting, bad checksum\n", cpu_num);
- error = -EINVAL;
- goto out;
- }
- }
- }
- cursor += total_size; /* goto the next update patch */
- } /* end of while */
-out:
- return error;
-}
-
-static void do_update_one (void * unused)
-{
- unsigned long flags;
- unsigned int val[2];
- int cpu_num = smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- if (uci->mc == NULL) {
- printk(KERN_INFO "microcode: No new microcode data for
CPU%d\n", cpu_num);
- return;
- }
-
- /* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
-
- /* write microcode via MSR 0x79 */
- wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) uci->mc->bits,
- (unsigned long) uci->mc->bits >> 16 >> 16);
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
- __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- /* notify the caller of success on this cpu */
- uci->err = MC_SUCCESS;
- spin_unlock_irqrestore(&microcode_update_lock, flags);
- printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x, date = %08x \n",
- cpu_num, uci->rev, val[1], uci->mc->hdr.date);
- return;
-}
static int do_microcode_update (void)
{
- int i, error;
+ int err;
+ dom0_op_t op;
- if (on_each_cpu(collect_cpu_info, NULL, 1, 1) != 0) {
- printk(KERN_ERR "microcode: Error! Could not run on all
processors\n");
- error = -EIO;
- goto out;
- }
+ err = sys_mlock((unsigned long)user_buffer, user_buffer_size);
+ if (err != 0)
+ return err;
- if ((error = find_matching_ucodes())) {
- printk(KERN_ERR "microcode: Error in the microcode data\n");
- goto out_free;
- }
+ op.cmd = DOM0_MICROCODE;
+ op.u.microcode.data = user_buffer;
+ op.u.microcode.length = user_buffer_size;
+ err = HYPERVISOR_dom0_op(&op);
- if (on_each_cpu(do_update_one, NULL, 1, 1) != 0) {
- printk(KERN_ERR "microcode: Error! Could not run on all
processors\n");
- error = -EIO;
- }
+ (void)sys_munlock((unsigned long)user_buffer, user_buffer_size);
-out_free:
- for (i = 0; i < num_online_cpus(); i++) {
- if (ucode_cpu_info[i].mc) {
- int j;
- void *tmp = ucode_cpu_info[i].mc;
- vfree(tmp);
- for (j = i; j < num_online_cpus(); j++) {
- if (ucode_cpu_info[j].mc == tmp)
- ucode_cpu_info[j].mc = NULL;
- }
- }
- }
-out:
- return error;
+ return err;
}
static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
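
The microcode driver shrinks dramatically because all of the matching and
MSR work moves into the hypervisor: the driver now just pins the user
buffer with sys_mlock(), hands it to the DOM0_MICROCODE dom0_op, and
unpins it. The device interface is unchanged, so the old-style loader
still applies; a bare-bones version for illustration (the 64KB buffer size
is an arbitrary choice here, not something the driver requires):

    /* ucode.c - write a microcode image to /dev/cpu/microcode.
     * Build: gcc -o ucode ucode.c   Run: ./ucode microcode.dat  (as root)
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        static char buf[65536];
        ssize_t len;
        int in, dev;

        if (argc != 2 || (in = open(argv[1], O_RDONLY)) < 0) {
            fprintf(stderr, "usage: %s <microcode image>\n", argv[0]);
            return 1;
        }
        len = read(in, buf, sizeof(buf));
        if (len <= 0)
            return 1;

        dev = open("/dev/cpu/microcode", O_WRONLY);
        if (dev < 0 || write(dev, buf, len) != len) {
            perror("/dev/cpu/microcode");
            return 1;
        }
        return 0;
    }
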
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/mpparse.c
linux-2.6-xen-sparse/arch/i386/kernel/mpparse.c
--- pristine-linux-2.6.12/arch/i386/kernel/mpparse.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/mpparse.c 2005-07-28
13:17:07.000000000 -0700
@@ -109,7 +109,7 @@ static int MP_valid_apicid(int apicid, i
{
return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf;
}
-#else
+#elif !defined(CONFIG_XEN)
static int MP_valid_apicid(int apicid, int version)
{
if (version >= 0x14)
@@ -119,6 +119,7 @@ static int MP_valid_apicid(int apicid, i
}
#endif
+#ifndef CONFIG_XEN
static void __init MP_processor_info (struct mpc_config_processor *m)
{
int ver, apicid;
@@ -217,6 +218,12 @@ static void __init MP_processor_info (st
apic_version[m->mpc_apicid] = ver;
bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
}
+#else
+void __init MP_processor_info (struct mpc_config_processor *m)
+{
+ num_processors++;
+}
+#endif /* CONFIG_XEN */
static void __init MP_bus_info (struct mpc_config_bus *m)
{
@@ -690,7 +697,7 @@ void __init get_smp_config (void)
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc((void *)mpf->mpf_physptr)) {
+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
smp_found_config = 0;
printk(KERN_ERR "BIOS bug, MP table errors
detected!...\n");
printk(KERN_ERR "... disabling SMP support. (tell your
hw vendor)\n");
@@ -725,7 +732,7 @@ void __init get_smp_config (void)
static int __init smp_scan_config (unsigned long base, unsigned long length)
{
- unsigned long *bp = phys_to_virt(base);
+ unsigned long *bp = isa_bus_to_virt(base);
struct intel_mp_floating *mpf;
Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
@@ -741,6 +748,7 @@ static int __init smp_scan_config (unsig
|| (mpf->mpf_specification == 4)) ) {
smp_found_config = 1;
+#ifndef CONFIG_XEN
printk(KERN_INFO "found SMP MP-table at %08lx\n",
virt_to_phys(mpf));
reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
@@ -760,6 +768,10 @@ static int __init smp_scan_config (unsig
size = end - mpf->mpf_physptr;
reserve_bootmem(mpf->mpf_physptr, size);
}
+#else
+ printk(KERN_INFO "found SMP MP-table at %08lx\n",
+ ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
+#endif
mpf_found = mpf;
return 1;
@@ -803,9 +815,11 @@ void __init find_smp_config (void)
* MP1.4 SPEC states to only scan first 1K of 4K EBDA.
*/
+#ifndef CONFIG_XEN
address = get_bios_ebda();
if (address)
smp_scan_config(address, 0x400);
+#endif
}
/* --------------------------------------------------------------------------
@@ -817,14 +831,14 @@ void __init find_smp_config (void)
void __init mp_register_lapic_address (
u64 address)
{
+#ifndef CONFIG_XEN
mp_lapic_addr = (unsigned long) address;
- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-
if (boot_cpu_physical_apicid == -1U)
boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
+#endif
}
@@ -844,6 +858,7 @@ void __init mp_register_lapic (
if (id == boot_cpu_physical_apicid)
boot_cpu = 1;
+#ifndef CONFIG_XEN
processor.mpc_type = MP_PROCESSOR;
processor.mpc_apicid = id;
processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
@@ -854,6 +869,7 @@ void __init mp_register_lapic (
processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
processor.mpc_reserved[0] = 0;
processor.mpc_reserved[1] = 0;
+#endif
MP_processor_info(&processor);
}
@@ -913,7 +929,6 @@ void __init mp_register_ioapic (
mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
mp_ioapics[idx].mpc_apicaddr = address;
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
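
The mpparse.c changes are all about addressing: under Xen a guest
"physical" address is pseudo-physical, so phys_to_virt() on the BIOS EBDA
or the 0xF0000 ROM region would point at the wrong memory; the MP table
has to be reached through isa_bus_to_virt() instead, and the EBDA scan is
skipped entirely. The scan itself is unchanged: walk the area in 16-byte
steps looking for the "_MP_" signature. Spelled out on a plain buffer
(sketch only; the real code compares against SMP_MAGIC_IDENT and then
checksums the candidate):

    /* mpscan.c - the 16-byte-aligned "_MP_" signature scan, standalone. */
    #include <stdio.h>
    #include <string.h>

    static long scan_mpf(const unsigned char *bp, long length)
    {
        long off;

        for (off = 0; off + 16 <= length; off += 16)
            if (memcmp(bp + off, "_MP_", 4) == 0)
                return off;
        return -1;
    }

    int main(void)
    {
        static unsigned char area[4096];

        memcpy(area + 0x400, "_MP_", 4);  /* plant a fake floating pointer */
        printf("found at offset %ld\n", scan_mpf(area, sizeof(area)));
        return 0;
    }
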
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/pci-dma.c
linux-2.6-xen-sparse/arch/i386/kernel/pci-dma.c
--- pristine-linux-2.6.12/arch/i386/kernel/pci-dma.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/pci-dma.c 2005-07-28
13:17:07.000000000 -0700
@@ -11,7 +11,10 @@
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/pci.h>
+#include <linux/version.h>
#include <asm/io.h>
+#include <asm-xen/balloon.h>
+#include <asm/tlbflush.h>
struct dma_coherent_mem {
void *virt_base;
@@ -26,7 +29,8 @@ void *dma_alloc_coherent(struct device *
{
void *ret;
struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
- int order = get_order(size);
+ unsigned int order = get_order(size);
+ unsigned long vstart;
/* ignore region specifiers */
gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
@@ -46,11 +50,14 @@ void *dma_alloc_coherent(struct device *
if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
gfp |= GFP_DMA;
- ret = (void *)__get_free_pages(gfp, order);
+ vstart = __get_free_pages(gfp, order);
+ ret = (void *)vstart;
if (ret != NULL) {
+ xen_contig_memory(vstart, order);
+
memset(ret, 0, size);
- *dma_handle = virt_to_phys(ret);
+ *dma_handle = virt_to_bus(ret);
}
return ret;
}
@@ -145,3 +152,131 @@ void *dma_mark_declared_memory_occupied(
return mem->virt_base + (pos << PAGE_SHIFT);
}
EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+
+static LIST_HEAD(dma_map_head);
+static DEFINE_SPINLOCK(dma_map_lock);
+struct dma_map_entry {
+ struct list_head list;
+ dma_addr_t dma;
+ char *bounce, *host;
+ size_t size;
+};
+#define DMA_MAP_MATCHES(e,d) (((e)->dma<=(d)) && (((e)->dma+(e)->size)>(d)))
+
+dma_addr_t
+dma_map_single(struct device *dev, void *ptr, size_t size,
+ enum dma_data_direction direction)
+{
+ struct dma_map_entry *ent;
+ void *bnc;
+ dma_addr_t dma;
+ unsigned long flags;
+
+ BUG_ON(direction == DMA_NONE);
+
+ /*
+ * Even if size is sub-page, the buffer may still straddle a page
+ * boundary. Take into account buffer start offset. All other calls are
+ * conservative and always search the dma_map list if it's non-empty.
+ */
+ if ((((unsigned int)ptr & ~PAGE_MASK) + size) <= PAGE_SIZE) {
+ dma = virt_to_bus(ptr);
+ } else {
+ BUG_ON((bnc = dma_alloc_coherent(dev, size, &dma, 0)) == NULL);
+ BUG_ON((ent = kmalloc(sizeof(*ent), GFP_KERNEL)) == NULL);
+ if (direction != DMA_FROM_DEVICE)
+ memcpy(bnc, ptr, size);
+ ent->dma = dma;
+ ent->bounce = bnc;
+ ent->host = ptr;
+ ent->size = size;
+ spin_lock_irqsave(&dma_map_lock, flags);
+ list_add(&ent->list, &dma_map_head);
+ spin_unlock_irqrestore(&dma_map_lock, flags);
+ }
+
+ flush_write_buffers();
+ return dma;
+}
+EXPORT_SYMBOL(dma_map_single);
+
+void
+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ struct dma_map_entry *ent;
+ unsigned long flags;
+
+ BUG_ON(direction == DMA_NONE);
+
+ /* Fast-path check: are there any multi-page DMA mappings? */
+ if (!list_empty(&dma_map_head)) {
+ spin_lock_irqsave(&dma_map_lock, flags);
+ list_for_each_entry ( ent, &dma_map_head, list ) {
+ if (DMA_MAP_MATCHES(ent, dma_addr)) {
+ list_del(&ent->list);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&dma_map_lock, flags);
+ if (&ent->list != &dma_map_head) {
+ BUG_ON(dma_addr != ent->dma);
+ BUG_ON(size != ent->size);
+ if (direction != DMA_TO_DEVICE)
+ memcpy(ent->host, ent->bounce, size);
+ dma_free_coherent(dev, size, ent->bounce, ent->dma);
+ kfree(ent);
+ }
+ }
+}
+EXPORT_SYMBOL(dma_unmap_single);
+
+void
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+ enum dma_data_direction direction)
+{
+ struct dma_map_entry *ent;
+ unsigned long flags, off;
+
+ /* Fast-path check: are there any multi-page DMA mappings? */
+ if (!list_empty(&dma_map_head)) {
+ spin_lock_irqsave(&dma_map_lock, flags);
+ list_for_each_entry ( ent, &dma_map_head, list )
+ if (DMA_MAP_MATCHES(ent, dma_handle))
+ break;
+ spin_unlock_irqrestore(&dma_map_lock, flags);
+ if (&ent->list != &dma_map_head) {
+ off = dma_handle - ent->dma;
+ BUG_ON((off + size) > ent->size);
+ /*if (direction != DMA_TO_DEVICE)*/
+ memcpy(ent->host+off, ent->bounce+off, size);
+ }
+ }
+}
+EXPORT_SYMBOL(dma_sync_single_for_cpu);
+
+void
+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
+ enum dma_data_direction direction)
+{
+ struct dma_map_entry *ent;
+ unsigned long flags, off;
+
+ /* Fast-path check: are there any multi-page DMA mappings? */
+ if (!list_empty(&dma_map_head)) {
+ spin_lock_irqsave(&dma_map_lock, flags);
+ list_for_each_entry ( ent, &dma_map_head, list )
+ if (DMA_MAP_MATCHES(ent, dma_handle))
+ break;
+ spin_unlock_irqrestore(&dma_map_lock, flags);
+ if (&ent->list != &dma_map_head) {
+ off = dma_handle - ent->dma;
+ BUG_ON((off + size) > ent->size);
+ /*if (direction != DMA_FROM_DEVICE)*/
+ memcpy(ent->bounce+off, ent->host+off, size);
+ }
+ }
+
+ flush_write_buffers();
+}
+EXPORT_SYMBOL(dma_sync_single_for_device);
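
The new tail of pci-dma.c implements bounce buffering for streaming DMA:
a guest buffer that spans multiple pages is not machine-contiguous, so
dma_map_single() allocates a coherent (machine-contiguous) bounce buffer,
copies through it, and records the mapping on dma_map_head; the
unmap/sync paths find the entry again purely by bus address. The lookup
predicate is a half-open interval test, easy to convince yourself of in
isolation:

    /* match.c - the DMA_MAP_MATCHES interval test, copied from above. */
    #include <assert.h>
    #include <stdio.h>

    struct dma_map_entry { unsigned long dma, size; };
    #define DMA_MAP_MATCHES(e,d) (((e)->dma<=(d)) && (((e)->dma+(e)->size)>(d)))

    int main(void)
    {
        struct dma_map_entry e = { .dma = 0x10000, .size = 0x3000 };

        assert( DMA_MAP_MATCHES(&e, 0x10000));   /* first byte       */
        assert( DMA_MAP_MATCHES(&e, 0x12fff));   /* last byte        */
        assert(!DMA_MAP_MATCHES(&e, 0x13000));   /* one past the end */
        printf("interval test ok\n");
        return 0;
    }

Note also that the two sync routines copy unconditionally (the direction
checks are commented out), which is safe but does more memcpy work than
strictly necessary.
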
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/process.c
linux-2.6-xen-sparse/arch/i386/kernel/process.c
--- pristine-linux-2.6.12/arch/i386/kernel/process.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/process.c 2005-07-28
13:17:07.000000000 -0700
@@ -13,6 +13,7 @@
#include <stdarg.h>
+#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
@@ -47,6 +48,7 @@
#include <asm/i387.h>
#include <asm/irq.h>
#include <asm/desc.h>
+#include <asm-xen/xen-public/physdev.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif
@@ -54,6 +56,9 @@
#include <linux/irq.h>
#include <linux/err.h>
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
static int hlt_counter;
@@ -89,54 +94,48 @@ void enable_hlt(void)
EXPORT_SYMBOL(enable_hlt);
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
+/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
+extern void stop_hz_timer(void);
+extern void start_hz_timer(void);
+void xen_idle(void)
{
- if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
- local_irq_disable();
- if (!need_resched())
- safe_halt();
- else
- local_irq_enable();
+ local_irq_disable();
+
+ if (need_resched()) {
+ local_irq_enable();
} else {
- cpu_relax();
+ stop_hz_timer();
+ HYPERVISOR_block(); /* implicit local_irq_enable() */
+ start_hz_timer();
}
}
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->work.need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle (void)
-{
- int oldval;
-
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ /* We shouldn't have to disable interrupts while dead, but
+ * some interrupts just don't seem to go away, and this makes
+ * it "work" for testing purposes. */
+ /* Death loop */
+ while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
+ HYPERVISOR_yield();
+
+ local_irq_disable();
+ __flush_tlb_all();
+ cpu_set(smp_processor_id(), cpu_online_map);
local_irq_enable();
-
- /*
- * Deal with another CPU just having chosen a thread to
- * run here:
- */
- oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
- if (!oldval) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- asm volatile(
- "2:"
- "testl %0, %1;"
- "rep; nop;"
- "je 2b;"
- : : "i"(_TIF_NEED_RESCHED), "m"
(current_thread_info()->flags));
-
- clear_thread_flag(TIF_POLLING_NRFLAG);
- } else {
- set_need_resched();
- }
}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
/*
* The idle thread. There's no useful work to be
@@ -146,22 +145,26 @@ static void poll_idle (void)
*/
void cpu_idle (void)
{
+ int cpu = _smp_processor_id();
+
/* endless idle loop with no priority at all */
while (1) {
while (!need_resched()) {
- void (*idle)(void);
if (__get_cpu_var(cpu_idle_state))
__get_cpu_var(cpu_idle_state) = 0;
-
rmb();
- idle = pm_idle;
- if (!idle)
- idle = default_idle;
+ if (cpu_is_offline(cpu)) {
+#if defined(CONFIG_XEN) && defined(CONFIG_HOTPLUG_CPU)
+ /* Tell hypervisor to take vcpu down. */
+ HYPERVISOR_vcpu_down(cpu);
+#endif
+ play_dead();
+ }
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
- idle();
+ xen_idle();
}
schedule();
}
@@ -195,74 +198,18 @@ void cpu_idle_wait(void)
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- */
-static void mwait_idle(void)
-{
- local_irq_enable();
-
- if (!need_resched()) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- do {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- if (need_resched())
- break;
- __mwait(0, 0);
- } while (!need_resched());
- clear_thread_flag(TIF_POLLING_NRFLAG);
- }
-}
-
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_MWAIT)) {
- printk("monitor/mwait feature present.\n");
- /*
- * Skip, if setup has overridden idle.
- * One CPU supports mwait => All CPUs supports mwait
- */
- if (!pm_idle) {
- printk("using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- }
- }
-}
-
-static int __init idle_setup (char *str)
-{
- if (!strncmp(str, "poll", 4)) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
-#ifdef CONFIG_X86_SMP
- if (smp_num_siblings > 1)
- printk("WARNING: polling idle and HT enabled,
performance may degrade.\n");
-#endif
- } else if (!strncmp(str, "halt", 4)) {
- printk("using halt in idle threads.\n");
- pm_idle = default_idle;
- }
-
- boot_option_idle_override = 1;
- return 1;
-}
-
-__setup("idle=", idle_setup);
+/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
+/* Always use xen_idle() instead. */
+void __init select_idle_routine(const struct cpuinfo_x86 *c) {}
void show_regs(struct pt_regs * regs)
{
- unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
-
printk("\n");
printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip,
smp_processor_id());
print_symbol("EIP is at %s\n", regs->eip);
- if (regs->xcs & 3)
+ if (regs->xcs & 2)
printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
printk(" EFLAGS: %08lx %s (%s)\n",
regs->eflags, print_tainted(), system_utsname.release);
@@ -273,17 +220,6 @@ void show_regs(struct pt_regs * regs)
printk(" DS: %04x ES: %04x\n",
0xffff & regs->xds,0xffff & regs->xes);
- __asm__("movl %%cr0, %0": "=r" (cr0));
- __asm__("movl %%cr2, %0": "=r" (cr2));
- __asm__("movl %%cr3, %0": "=r" (cr3));
- /* This could fault if %cr4 does not exist */
- __asm__("1: movl %%cr4, %0 \n"
- "2: \n"
- ".section __ex_table,\"a\" \n"
- ".long 1b,2b \n"
- ".previous \n"
- : "=r" (cr4): "0" (0));
- printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3,
cr4);
show_trace(NULL, ®s->esp);
}
@@ -336,20 +272,11 @@ void exit_thread(void)
/* The process may have allocated an io port bitmap... nuke it. */
if (unlikely(NULL != t->io_bitmap_ptr)) {
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
-
+ physdev_op_t op = { 0 };
+ op.cmd = PHYSDEVOP_SET_IOBITMAP;
+ HYPERVISOR_physdev_op(&op);
kfree(t->io_bitmap_ptr);
t->io_bitmap_ptr = NULL;
- /*
- * Careful, clear this in the TSS too:
- */
- memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
- t->io_bitmap_max = 0;
- tss->io_bitmap_owner = NULL;
- tss->io_bitmap_max = 0;
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
- put_cpu();
}
}
@@ -458,6 +385,8 @@ int copy_thread(int nr, unsigned long cl
desc->b = LDT_entry_b(&info);
}
+ p->thread.io_pl = current->thread.io_pl;
+
err = 0;
out:
if (err && p->thread.io_bitmap_ptr) {
@@ -525,40 +454,10 @@ int dump_task_regs(struct task_struct *t
elf_core_copy_regs(regs, &ptregs);
+ boot_option_idle_override = 1;
return 1;
}
-static inline void
-handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss)
-{
- if (!next->io_bitmap_ptr) {
- /*
- * Disable the bitmap via an invalid offset. We still cache
- * the previous bitmap owner and the IO bitmap contents:
- */
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
- return;
- }
- if (likely(next == tss->io_bitmap_owner)) {
- /*
- * Previous owner of the bitmap (hence the bitmap content)
- * matches the next task, we dont have to do anything but
- * to set a valid offset in the TSS:
- */
- tss->io_bitmap_base = IO_BITMAP_OFFSET;
- return;
- }
- /*
- * Lazy TSS's I/O bitmap copy. We set an invalid offset here
- * and we let the task to get a GPF in case an I/O instruction
- * is performed. The handler of the GPF will verify that the
- * faulting task has a valid I/O bitmap and, it true, does the
- * real copy and restart the instruction. This will save us
- * redundant copies when the currently switched task does not
- * perform any I/O during its timeslice.
- */
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
-}
/*
* switch_to(x,yn) should switch tasks from x to y.
@@ -593,32 +492,77 @@ struct task_struct fastcall * __switch_t
*next = &next_p->thread;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ physdev_op_t iopl_op, iobmp_op;
+ multicall_entry_t _mcl[8], *mcl = _mcl;
- /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
+ /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
- __unlazy_fpu(prev_p);
+ /*
+ * This is basically '__unlazy_fpu', except that we queue a
+ * multicall to indicate FPU task switch, rather than
+ * synchronously trapping to Xen.
+ */
+ if (prev_p->thread_info->status & TS_USEDFPU) {
+ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = 1;
+ mcl++;
+ }
/*
* Reload esp0, LDT and the page table pointer:
+ * This is load_esp0(tss, next) with a multicall.
*/
- load_esp0(tss, next);
+ tss->esp0 = next->esp0;
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = tss->ss0;
+ mcl->args[1] = tss->esp0;
+ mcl++;
/*
* Load the per-thread Thread-Local Storage descriptor.
+ * This is load_TLS(next, cpu) with multicalls.
*/
- load_TLS(next, cpu);
+#define C(i) do { \
+ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
+ next->tls_array[i].b != prev->tls_array[i].b)) { \
+ mcl->op = __HYPERVISOR_update_descriptor; \
+ mcl->args[0] = virt_to_machine(&get_cpu_gdt_table(cpu) \
+ [GDT_ENTRY_TLS_MIN + i]); \
+ mcl->args[1] = ((u32 *)&next->tls_array[i])[0]; \
+ mcl->args[2] = ((u32 *)&next->tls_array[i])[1]; \
+ mcl++; \
+ } \
+} while (0)
+ C(0); C(1); C(2);
+#undef C
+
+ if (unlikely(prev->io_pl != next->io_pl)) {
+ iopl_op.cmd = PHYSDEVOP_SET_IOPL;
+ iopl_op.u.set_iopl.iopl = next->io_pl;
+ mcl->op = __HYPERVISOR_physdev_op;
+ mcl->args[0] = (unsigned long)&iopl_op;
+ mcl++;
+ }
+
+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
+ iobmp_op.cmd =
+ PHYSDEVOP_SET_IOBITMAP;
+ iobmp_op.u.set_iobitmap.bitmap =
+ (unsigned long)next->io_bitmap_ptr;
+ iobmp_op.u.set_iobitmap.nr_ports =
+ next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
+ mcl->op = __HYPERVISOR_physdev_op;
+ mcl->args[0] = (unsigned long)&iobmp_op;
+ mcl++;
+ }
- /*
- * Save away %fs and %gs. No need to save %es and %ds, as
- * those are always kernel segments while inside the kernel.
- */
- asm volatile("mov %%fs,%0":"=m" (prev->fs));
- asm volatile("mov %%gs,%0":"=m" (prev->gs));
+ (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
/*
* Restore %fs and %gs if needed.
*/
- if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
+ if (unlikely(next->fs | next->gs)) {
loadsegment(fs, next->fs);
loadsegment(gs, next->gs);
}
@@ -636,9 +580,6 @@ struct task_struct fastcall * __switch_t
loaddebug(next, 7);
}
- if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr))
- handle_io_bitmap(next, tss);
-
return prev_p;
}
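
The interesting bit in the __switch_to() rewrite above is that nothing
traps to Xen per operation: the FPU switch, stack switch, up to three
TLS descriptor updates and the two physdev ops are queued into _mcl[8]
and issued with a single HYPERVISOR_multicall. A toy version of that
queue-then-flush pattern (printf stands in for the hypervisor trap; all
*_ex names and op numbers are invented):

	#include <stdio.h>

	struct mc_entry_ex { unsigned long op; unsigned long args[3]; };

	/* Append one call to a caller-provided batch... */
	static struct mc_entry_ex *mc_queue_ex(struct mc_entry_ex *mcl,
					       unsigned long op,
					       unsigned long a0,
					       unsigned long a1)
	{
		mcl->op = op;
		mcl->args[0] = a0;
		mcl->args[1] = a1;
		return mcl + 1;
	}

	/* ...then pay for one trap, however many calls were queued. */
	static void multicall_ex(struct mc_entry_ex *calls, long n)
	{
		for (long i = 0; i < n; i++)
			printf("hypercall %lu(%lu, %lu)\n", calls[i].op,
			       calls[i].args[0], calls[i].args[1]);
	}

	int main(void)
	{
		struct mc_entry_ex _mcl[8], *mcl = _mcl;

		mcl = mc_queue_ex(mcl, 5, 1, 0);     /* "fpu_taskswitch" */
		mcl = mc_queue_ex(mcl, 3, 0x68, 0);  /* "stack_switch"   */
		multicall_ex(_mcl, mcl - _mcl);      /* one trap, two ops */
		return 0;
	}
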
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/quirks.c linux-2.6-xen-sparse/arch/i386/kernel/quirks.c
--- pristine-linux-2.6.12/arch/i386/kernel/quirks.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/quirks.c	2005-07-28 13:17:07.000000000 -0700
@@ -32,14 +32,11 @@ static void __devinit quirk_intel_irqbal
raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
if (!(word & (1 << 13))) {
+ dom0_op_t op;
printk(KERN_INFO "Disabling irq balancing and affinity\n");
-#ifdef CONFIG_IRQBALANCE
- irqbalance_disable("");
-#endif
- noirqdebug_setup("");
-#ifdef CONFIG_PROC_FS
- no_irq_affinity = 1;
-#endif
+ op.cmd = DOM0_PLATFORM_QUIRK;
+ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
+ (void)HYPERVISOR_dom0_op(&op);
}
config &= ~0x2;
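
(The quirk above turns into a message to Xen: fill in a tagged-union
dom0_op_t, set cmd plus the matching union member, and make one
hypercall; the hypervisor then applies the quirk machine-wide. The
shape of that ABI, sketched with invented names and numbers:)

	#include <stdio.h>

	typedef struct {
		int cmd;
		union {
			struct { int quirk_id; } platform_quirk;
		} u;
	} dom0_op_ex_t;

	/* Models HYPERVISOR_dom0_op(); really a trap into Xen. */
	static int dom0_op_ex(dom0_op_ex_t *op)
	{
		printf("dom0_op cmd=%d quirk=%d\n", op->cmd,
		       op->u.platform_quirk.quirk_id);
		return 0;
	}

	int main(void)
	{
		dom0_op_ex_t op;

		op.cmd = 39;                      /* "DOM0_PLATFORM_QUIRK" */
		op.u.platform_quirk.quirk_id = 1; /* "QUIRK_NOIRQBALANCING" */
		(void)dom0_op_ex(&op);
		return 0;
	}
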
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/setup.c linux-2.6-xen-sparse/arch/i386/kernel/setup.c
--- pristine-linux-2.6.12/arch/i386/kernel/setup.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/setup.c	2005-07-28 13:17:07.000000000 -0700
@@ -41,6 +41,9 @@
#include <linux/init.h>
#include <linux/edd.h>
#include <linux/nodemask.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
#include <video/edid.h>
#include <asm/e820.h>
#include <asm/mpspec.h>
@@ -50,13 +53,18 @@
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/io.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/xen-public/physdev.h>
#include "setup_arch_pre.h"
#include <bios_ebda.h>
-/* This value is set up by the early boot code to point to the value
- immediately after the boot time page tables. It contains a *physical*
- address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_end __initdata = ~0UL;
+/* Allows setting of maximum possible memory size */
+static unsigned long xen_override_max_pfn;
+
+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xen_panic_block = {
+ xen_panic_event, NULL, 0 /* try to go last */
+};
int disable_pse __initdata = 0;
@@ -70,9 +78,9 @@ EXPORT_SYMBOL(efi_enabled);
#endif
/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 0, 1, 0, -1 };
/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 0, 1, 0, -1 };
unsigned long mmu_cr4_features;
@@ -146,6 +154,7 @@ static struct resource code_resource = {
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
static struct resource system_rom_resource = {
.name = "System ROM",
.start = 0xf0000,
@@ -201,6 +210,7 @@ static struct resource video_rom_resourc
.end = 0xc7fff,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
};
+#endif
static struct resource video_ram_resource = {
.name = "Video RAM area",
@@ -259,6 +269,7 @@ static struct resource standard_io_resou
#define STANDARD_IO_RESOURCES \
(sizeof standard_io_resources / sizeof standard_io_resources[0])
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
static int __init romchecksum(unsigned char *rom, unsigned long length)
@@ -276,6 +287,10 @@ static void __init probe_roms(void)
unsigned char *rom;
int i;
+ /* Nothing to do if not running in dom0. */
+ if (!(xen_start_info.flags & SIF_INITDOMAIN))
+ return;
+
/* video rom */
upper = adapter_rom_resources[0].start;
for (start = video_rom_resource.start; start < upper; start += 2048) {
@@ -334,6 +349,20 @@ static void __init probe_roms(void)
start = adapter_rom_resources[i++].end & ~2047UL;
}
}
+#endif
+
+/*
+ * Point at the empty zero page to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
+unsigned int *phys_to_machine_mapping, *pfn_to_mfn_frame_list;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/* Raw start-of-day parameters from the hypervisor. */
+union xen_start_info_union xen_start_info_union;
static void __init limit_regions(unsigned long long size)
{
@@ -414,6 +443,7 @@ static void __init print_memory_map(char
}
}
+#if 0
/*
* Sanitize the BIOS e820 map.
*
@@ -633,6 +663,7 @@ static int __init copy_e820_map(struct e
} while (biosmap++,--nr_map);
return 0;
}
+#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
@@ -666,11 +697,14 @@ static inline void copy_edd(void)
static void __init parse_cmdline_early (char ** cmdline_p)
{
char c = ' ', *to = command_line, *from = saved_command_line;
- int len = 0;
+ int len = 0, max_cmdline;
int userdef = 0;
+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
+ max_cmdline = COMMAND_LINE_SIZE;
+ memcpy(saved_command_line, xen_start_info.cmd_line, max_cmdline);
/* Save unparsed command line copy for /proc/cmdline */
- saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
+ saved_command_line[max_cmdline-1] = '\0';
for (;;) {
if (c != ' ')
@@ -702,8 +736,13 @@ static void __init parse_cmdline_early (
unsigned long long mem_size;
mem_size = memparse(from+4, &from);
+#if 0
limit_regions(mem_size);
userdef=1;
+#else
+ xen_override_max_pfn =
+ (unsigned long)(mem_size>>PAGE_SHIFT);
+#endif
}
}
@@ -744,7 +783,7 @@ static void __init parse_cmdline_early (
noexec_setup(from + 7);
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_X86_MPPARSE
/*
* If the BIOS enumerates physical processors before logical,
* maxcpus=N at enumeration-time can be used to disable HT.
@@ -846,6 +885,7 @@ static void __init parse_cmdline_early (
}
}
+#if 0 /* !XEN */
/*
* Callback for efi_memory_walk.
*/
@@ -889,6 +929,15 @@ void __init find_max_pfn(void)
max_pfn = end;
}
}
+#else
+/* We don't use the fake e820 because we need to respond to user override. */
+void __init find_max_pfn(void)
+{
+ if ( xen_override_max_pfn < xen_start_info.nr_pages )
+ xen_override_max_pfn = xen_start_info.nr_pages;
+ max_pfn = xen_override_max_pfn;
+}
+#endif /* XEN */
/*
* Determine low and high memory ranges:
@@ -1011,6 +1060,7 @@ static void __init register_bootmem_low_
}
}
+#ifndef CONFIG_XEN
/*
* workaround for Dell systems that neglect to reserve EBDA
*/
@@ -1021,16 +1071,18 @@ static void __init reserve_ebda_region(v
if (addr)
reserve_bootmem(addr, PAGE_SIZE);
}
+#endif
#ifndef CONFIG_DISCONTIGMEM
void __init setup_bootmem_allocator(void);
static unsigned long __init setup_memory(void)
{
+
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
- min_low_pfn = PFN_UP(init_pg_tables_end);
+ min_low_pfn = PFN_UP(__pa(xen_start_info.pt_base)) + xen_start_info.nr_pt_frames;
find_max_pfn();
@@ -1057,7 +1109,14 @@ void __init zone_sizes_init(void)
unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
unsigned int max_dma, low;
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ /*
+ * XEN: Our notion of "DMA memory" is fake when running over Xen.
+ * We simply put all RAM in the DMA zone so that those drivers which
+ * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
+ * Those drivers that *do* require lowmem are screwed anyway when
+ * running over Xen!
+ */
+ max_dma = max_low_pfn;
low = max_low_pfn;
if (low < max_dma)
@@ -1095,6 +1154,7 @@ void __init setup_bootmem_allocator(void
reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) +
bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
+#ifndef CONFIG_XEN
/*
* reserve physical page 0 - it's a special BIOS page on many boxes,
* enabling clean reboots, SMP operation, laptop functions.
@@ -1125,20 +1185,15 @@ void __init setup_bootmem_allocator(void
*/
acpi_reserve_bootmem();
#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
-#endif
+#endif /* !CONFIG_XEN */
#ifdef CONFIG_BLK_DEV_INITRD
- if (LOADER_TYPE && INITRD_START) {
+ if (xen_start_info.mod_start) {
if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
- reserve_bootmem(INITRD_START, INITRD_SIZE);
- initrd_start =
- INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
+ /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
+ initrd_start = INITRD_START + PAGE_OFFSET;
initrd_end = initrd_start+INITRD_SIZE;
+ initrd_below_start_ok = 1;
}
else {
printk(KERN_ERR "initrd extends beyond end of memory "
@@ -1149,6 +1204,8 @@ void __init setup_bootmem_allocator(void
}
}
#endif
+
+ phys_to_machine_mapping = (unsigned int *)xen_start_info.mfn_list;
}
/*
@@ -1178,7 +1235,9 @@ legacy_init_iomem_resources(struct resou
{
int i;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
probe_roms();
+#endif
for (i = 0; i < e820.nr_map; i++) {
struct resource *res;
if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
@@ -1220,8 +1279,9 @@ static void __init register_memory(void)
else
legacy_init_iomem_resources(&code_resource, &data_resource);
- /* EFI systems may still have VGA */
- request_resource(&iomem_resource, &video_ram_resource);
+ if (xen_start_info.flags & SIF_INITDOMAIN)
+ /* EFI systems may still have VGA */
+ request_resource(&iomem_resource, &video_ram_resource);
/* request I/O space for devices used on all i[345]86 PCs */
for (i = 0; i < STANDARD_IO_RESOURCES; i++)
@@ -1396,10 +1456,23 @@ static void set_mca_bus(int x) { }
*/
void __init setup_arch(char **cmdline_p)
{
+ int i, j;
+ physdev_op_t op;
unsigned long max_low_pfn;
+ /* Force a quick death if the kernel panics. */
+ extern int panic_timeout;
+ if (panic_timeout == 0)
+ panic_timeout = 1;
+
+ /* Register a call for panic conditions. */
+ notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_writable_pagetables);
+
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
- pre_setup_arch_hook();
early_cpu_init();
/*
@@ -1414,7 +1487,10 @@ void __init setup_arch(char **cmdline_p)
efi_enabled = 1;
#endif
- ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
+ /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
+ properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
+ */
+ ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
drive_info = DRIVE_INFO;
screen_info = SCREEN_INFO;
edid_info = EDID_INFO;
@@ -1429,6 +1505,16 @@ void __init setup_arch(char **cmdline_p)
}
bootloader_type = LOADER_TYPE;
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+ /* This is drawn from a dump from vgacon:startup in standard Linux. */
+ screen_info.orig_video_mode = 3;
+ screen_info.orig_video_isVGA = 1;
+ screen_info.orig_video_lines = 25;
+ screen_info.orig_video_cols = 80;
+ screen_info.orig_video_ega_bx = 3;
+ screen_info.orig_video_points = 16;
+#endif
+
#ifdef CONFIG_BLK_DEV_RAM
rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
@@ -1449,12 +1535,14 @@ void __init setup_arch(char **cmdline_p)
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
+ init_mm.brk = (PFN_UP(__pa(xen_start_info.pt_base)) +
+ xen_start_info.nr_pt_frames) << PAGE_SHIFT;
- code_resource.start = virt_to_phys(_text);
- code_resource.end = virt_to_phys(_etext)-1;
- data_resource.start = virt_to_phys(_etext);
- data_resource.end = virt_to_phys(_edata)-1;
+ /* XEN: This is nonsense: kernel may not even be contiguous in RAM. */
+ /*code_resource.start = virt_to_phys(_text);*/
+ /*code_resource.end = virt_to_phys(_etext)-1;*/
+ /*data_resource.start = virt_to_phys(_etext);*/
+ /*data_resource.end = virt_to_phys(_edata)-1;*/
parse_cmdline_early(cmdline_p);
@@ -1477,6 +1565,51 @@ void __init setup_arch(char **cmdline_p)
remapped_pgdat_init();
zone_sizes_init();
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+#endif
+
+ /* Make sure we have a correctly sized P->M table. */
+ if (max_pfn != xen_start_info.nr_pages) {
+ phys_to_machine_mapping = alloc_bootmem_low_pages(
+ max_pfn * sizeof(unsigned long));
+
+ if (max_pfn > xen_start_info.nr_pages) {
+ /* set to INVALID_P2M_ENTRY */
+ memset(phys_to_machine_mapping, ~0,
+ max_pfn * sizeof(unsigned long));
+ memcpy(phys_to_machine_mapping,
+ (unsigned long *)xen_start_info.mfn_list,
+ xen_start_info.nr_pages * sizeof(unsigned long));
+ } else {
+ memcpy(phys_to_machine_mapping,
+ (unsigned long *)xen_start_info.mfn_list,
+ max_pfn * sizeof(unsigned long));
+ if (HYPERVISOR_dom_mem_op(
+ MEMOP_decrease_reservation,
+ (unsigned long *)xen_start_info.mfn_list + max_pfn,
+ xen_start_info.nr_pages - max_pfn, 0) !=
+ (xen_start_info.nr_pages - max_pfn)) BUG();
+ }
+ free_bootmem(
+ __pa(xen_start_info.mfn_list),
+ PFN_PHYS(PFN_UP(xen_start_info.nr_pages *
+ sizeof(unsigned long))));
+ }
+
+ pfn_to_mfn_frame_list = alloc_bootmem_low_pages(PAGE_SIZE);
+ for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ )
+ {
+ pfn_to_mfn_frame_list[j] =
+ virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT;
+ }
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list =
+ virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT;
+
+
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
@@ -1502,6 +1635,18 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled)
efi_map_memmap();
+ op.cmd = PHYSDEVOP_SET_IOPL;
+ op.u.set_iopl.iopl = current->thread.io_pl = 1;
+ HYPERVISOR_physdev_op(&op);
+
+#ifdef CONFIG_ACPI_BOOT
+ if (!(xen_start_info.flags & SIF_INITDOMAIN)) {
+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+ acpi_disabled = 1;
+ acpi_ht = 0;
+ }
+#endif
+
#ifdef CONFIG_ACPI_BOOT
/*
* Parse the ACPI tables for possible boot-time SMP configuration.
@@ -1515,16 +1660,46 @@ void __init setup_arch(char **cmdline_p)
get_smp_config();
#endif
+ /* XXX Disable irqdebug until we have a way to avoid interrupt
+ * conflicts. */
+ noirqdebug_setup("");
+
register_memory();
+ if (xen_start_info.flags & SIF_INITDOMAIN) {
+ if (!(xen_start_info.flags & SIF_PRIVILEGED))
+ panic("Xen granted us console access "
+ "but not privileged status");
+
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
+ if (!efi_enabled ||
+ (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
+ conswitchp = &dummy_con;
+#endif
#endif
+ } else {
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ extern const struct consw xennull_con;
+ extern int console_use_vt;
+#if defined(CONFIG_VGA_CONSOLE)
+ /* disable VGA driver */
+ ORIG_VIDEO_ISVGA = VIDEO_TYPE_VLFB;
#endif
+ conswitchp = &xennull_con;
+ console_use_vt = 0;
+#endif
+ }
+}
+
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ HYPERVISOR_crash();
+ /* we're never actually going to get here... */
+ return NOTIFY_DONE;
}
#include "setup_arch_post.h"
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/signal.c linux-2.6-xen-sparse/arch/i386/kernel/signal.c
--- pristine-linux-2.6.12/arch/i386/kernel/signal.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/signal.c	2005-08-02 00:59:44.000000000 -0700
@@ -599,7 +599,7 @@ int fastcall do_signal(struct pt_regs *r
* kernel mode. Just return without doing anything
* if so.
*/
- if ((regs->xcs & 3) != 3)
+ if ((regs->xcs & 2) != 2)
return 1;
if (current->flags & PF_FREEZE) {
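
The "& 2" tests here and in show_regs() earlier are worth a note: the
low two bits of %cs are the RPL, and native Linux checks RPL==3 because
only rings 0 and 3 are used. Under Xen the kernel runs in ring 1, so
bit 1 alone splits rings 0/1 (kernel) from rings 2/3 (user). A sketch;
the selector values below are illustrative only:

	#include <assert.h>

	static int user_mode_ex(unsigned int cs)
	{
		return (cs & 2) != 0;   /* rings 2/3 => user */
	}

	int main(void)
	{
		assert(!user_mode_ex(0x60));   /* RPL 0: ring-0 kernel */
		assert(!user_mode_ex(0x61));   /* RPL 1: ring-1 kernel */
		assert( user_mode_ex(0x73));   /* RPL 3: ring-3 user   */
		return 0;
	}
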
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/smpboot.c linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c
--- pristine-linux-2.6.12/arch/i386/kernel/smpboot.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c	2005-07-28 13:17:07.000000000 -0700
@@ -44,6 +44,9 @@
#include <linux/smp_lock.h>
#include <linux/irq.h>
#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/mc146818rtc.h>
@@ -51,7 +54,11 @@
#include <asm/desc.h>
#include <asm/arch_hooks.h>
-#include <mach_apic.h>
+#include <asm/smp_alt.h>
+
+#ifndef CONFIG_X86_IO_APIC
+#define Dprintk(args...)
+#endif
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>
@@ -79,6 +86,7 @@ u8 x86_cpu_to_apicid[NR_CPUS] =
{ [0 ... NR_CPUS-1] = 0xff };
EXPORT_SYMBOL(x86_cpu_to_apicid);
+#if 0
/*
* Trampoline 80x86 program as an array.
*/
@@ -87,9 +95,19 @@ extern unsigned char trampoline_data [];
extern unsigned char trampoline_end [];
static unsigned char *trampoline_base;
static int trampoline_exec;
+#endif
-static void map_cpu_to_logical_apicid(void);
+#ifdef CONFIG_HOTPLUG_CPU
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+#endif
+
+static DEFINE_PER_CPU(int, resched_irq);
+static DEFINE_PER_CPU(int, callfunc_irq);
+static char resched_name[NR_CPUS][15];
+static char callfunc_name[NR_CPUS][15];
+#if 0
/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
@@ -101,6 +119,9 @@ static unsigned long __init setup_trampo
 memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
return virt_to_phys(trampoline_base);
}
+#endif
+
+static void map_cpu_to_logical_apicid(void);
/*
* We are called very early to get the low memory for the
@@ -108,6 +129,15 @@ static unsigned long __init setup_trampo
*/
void __init smp_alloc_memory(void)
{
+#if 1
+ int cpu;
+
+ for (cpu = 1; cpu < NR_CPUS; cpu++) {
+ cpu_gdt_descr[cpu].address = (unsigned long)
+ alloc_bootmem_low_pages(PAGE_SIZE);
+ /* XXX free unused pages later */
+ }
+#else
trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
/*
* Has to be in very low memory so we can execute
@@ -119,6 +149,7 @@ void __init smp_alloc_memory(void)
* Make the SMP trampoline executable:
*/
trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
+#endif
}
/*
@@ -179,6 +210,7 @@ valid_k7:
;
}
+#if 0
/*
* TSC synchronization.
*
@@ -315,6 +347,7 @@ static void __init synchronize_tsc_ap (v
}
}
#undef NR_LOOPS
+#endif
extern void calibrate_delay(void);
@@ -325,6 +358,7 @@ static void __init smp_callin(void)
int cpuid, phys_id;
unsigned long timeout;
+#if 0
/*
* If waken up by an INIT in an 82489DX configuration
* we may get here before an INIT-deassert IPI reaches
@@ -332,11 +366,12 @@ static void __init smp_callin(void)
* lock up on an APIC access.
*/
wait_for_init_deassert(&init_deasserted);
+#endif
/*
* (This works even if the APIC is not enabled.)
*/
- phys_id = GET_APIC_ID(apic_read(APIC_ID));
+ phys_id = smp_processor_id();
cpuid = smp_processor_id();
if (cpu_isset(cpuid, cpu_callin_map)) {
printk("huh, phys CPU#%d, CPU#%d already present??\n",
@@ -372,6 +407,7 @@ static void __init smp_callin(void)
BUG();
}
+#if 0
/*
* the boot CPU has finished the init stage and is spinning
* on callin_map until we finish. We are free to set up this
@@ -382,6 +418,7 @@ static void __init smp_callin(void)
Dprintk("CALLIN, before setup_local_APIC().\n");
smp_callin_clear_local_apic();
setup_local_APIC();
+#endif
map_cpu_to_logical_apicid();
/*
@@ -395,22 +432,49 @@ static void __init smp_callin(void)
*/
smp_store_cpu_info(cpuid);
+#if 0
disable_APIC_timer();
+#endif
/*
* Allow the master to continue.
*/
cpu_set(cpuid, cpu_callin_map);
+#if 0
/*
* Synchronize the TSC with the BP
*/
if (cpu_has_tsc && cpu_khz)
synchronize_tsc_ap();
+#endif
}
static int cpucount;
+
+static irqreturn_t ldebug_interrupt(
+ int irq, void *dev_id, struct pt_regs *regs)
+{
+ return IRQ_HANDLED;
+}
+
+static DEFINE_PER_CPU(int, ldebug_irq);
+static char ldebug_name[NR_CPUS][15];
+
+void ldebug_setup(void)
+{
+ int cpu = smp_processor_id();
+
+ per_cpu(ldebug_irq, cpu) = bind_virq_to_irq(VIRQ_DEBUG);
+ sprintf(ldebug_name[cpu], "ldebug%d", cpu);
+ BUG_ON(request_irq(per_cpu(ldebug_irq, cpu), ldebug_interrupt,
+ SA_INTERRUPT, ldebug_name[cpu], NULL));
+}
+
+
+extern void local_setup_timer(void);
+
/*
* Activate a secondary processor.
*/
@@ -425,13 +489,10 @@ static void __init start_secondary(void
smp_callin();
while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
rep_nop();
- setup_secondary_APIC_clock();
- if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
- enable_NMI_through_LVT0(NULL);
- enable_8259A_irq(0);
- }
- enable_APIC_timer();
+ local_setup_timer();
+ ldebug_setup();
+ smp_intr_init();
+ local_irq_enable();
/*
* low-memory mappings have been cleared, flush them from
* the local TLBs too.
@@ -510,7 +571,7 @@ u8 cpu_2_logical_apicid[NR_CPUS] = { [0
static void map_cpu_to_logical_apicid(void)
{
int cpu = smp_processor_id();
- int apicid = logical_smp_processor_id();
+ int apicid = smp_processor_id();
cpu_2_logical_apicid[cpu] = apicid;
map_cpu_to_node(cpu, apicid_to_node(apicid));
@@ -560,6 +621,7 @@ static inline void __inquire_remote_apic
}
#endif
+#if 0
#ifdef WAKE_SECONDARY_VIA_NMI
/*
 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
@@ -745,6 +807,7 @@ wakeup_secondary_cpu(int phys_apicid, un
return (send_status | accept_status);
}
#endif /* WAKE_SECONDARY_VIA_INIT */
+#endif
extern cpumask_t cpu_initialized;
@@ -759,7 +822,15 @@ static int __init do_boot_cpu(int apicid
unsigned long boot_error;
int timeout, cpu;
unsigned long start_eip;
+#if 0
unsigned short nmi_high = 0, nmi_low = 0;
+#endif
+ vcpu_guest_context_t ctxt;
+ extern void startup_32_smp(void);
+ extern void hypervisor_callback(void);
+ extern void failsafe_callback(void);
+ extern void smp_trap_init(trap_info_t *);
+ int i;
cpu = ++cpucount;
/*
@@ -771,7 +842,7 @@ static int __init do_boot_cpu(int apicid
panic("failed fork for CPU %d", cpu);
idle->thread.eip = (unsigned long) start_secondary;
/* start_eip had better be page-aligned! */
- start_eip = setup_trampoline();
+ start_eip = (unsigned long)startup_32_smp;
/* So we see what's up */
printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
@@ -787,6 +858,107 @@ static int __init do_boot_cpu(int apicid
atomic_set(&init_deasserted, 0);
+#if 1
+ if (cpu_gdt_descr[0].size > PAGE_SIZE)
+ BUG();
+ cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
+ printk("GDT: copying %d bytes from %lx to %lx\n",
+ cpu_gdt_descr[0].size, cpu_gdt_descr[0].address,
+ cpu_gdt_descr[cpu].address);
+ memcpy((void *)cpu_gdt_descr[cpu].address,
+ (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size);
+
+ memset(&ctxt, 0, sizeof(ctxt));
+
+ ctxt.user_regs.ds = __USER_DS;
+ ctxt.user_regs.es = __USER_DS;
+ ctxt.user_regs.fs = 0;
+ ctxt.user_regs.gs = 0;
+ ctxt.user_regs.ss = __KERNEL_DS;
+ ctxt.user_regs.cs = __KERNEL_CS;
+ ctxt.user_regs.eip = start_eip;
+ ctxt.user_regs.esp = idle->thread.esp;
+ ctxt.user_regs.eflags = (1<<9) | (1<<2) | (idle->thread.io_pl<<12);
+
+ /* FPU is set up to default initial state. */
+ memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
+
+ /* Virtual IDT is empty at start-of-day. */
+ for ( i = 0; i < 256; i++ )
+ {
+ ctxt.trap_ctxt[i].vector = i;
+ ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
+ }
+ smp_trap_init(ctxt.trap_ctxt);
+
+ /* No LDT. */
+ ctxt.ldt_ents = 0;
+
+ {
+ unsigned long va;
+ int f;
+
+ for (va = cpu_gdt_descr[cpu].address, f = 0;
+ va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size;
+ va += PAGE_SIZE, f++) {
+ ctxt.gdt_frames[f] = virt_to_machine(va) >> PAGE_SHIFT;
+ make_page_readonly((void *)va);
+ }
+ ctxt.gdt_ents = cpu_gdt_descr[cpu].size / 8;
+ }
+
+ /* Ring 1 stack is the initial stack. */
+ ctxt.kernel_ss = __KERNEL_DS;
+ ctxt.kernel_sp = idle->thread.esp;
+
+ /* Callback handlers. */
+ ctxt.event_callback_cs = __KERNEL_CS;
+ ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
+ ctxt.failsafe_callback_cs = __KERNEL_CS;
+ ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
+
+ ctxt.ctrlreg[3] = (unsigned long)virt_to_machine(swapper_pg_dir);
+
+ boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt);
+ printk("boot error: %ld\n", boot_error);
+
+ if (!boot_error) {
+ /*
+ * allow APs to start initializing.
+ */
+ Dprintk("Before Callout %d.\n", cpu);
+ cpu_set(cpu, cpu_callout_map);
+ Dprintk("After Callout %d.\n", cpu);
+
+ /*
+ * Wait 5s total for a response
+ */
+ for (timeout = 0; timeout < 50000; timeout++) {
+ if (cpu_isset(cpu, cpu_callin_map))
+ break; /* It has booted */
+ udelay(100);
+ }
+
+ if (cpu_isset(cpu, cpu_callin_map)) {
+ /* number CPUs logically, starting from 1 (BSP is 0) */
+ Dprintk("OK.\n");
+ printk("CPU%d: ", cpu);
+ print_cpu_info(&cpu_data[cpu]);
+ Dprintk("CPU has booted.\n");
+ } else {
+ boot_error= 1;
+ }
+ }
+ x86_cpu_to_apicid[cpu] = apicid;
+ if (boot_error) {
+ /* Try to put things back the way they were before ... */
+ unmap_cpu_to_logical_apicid(cpu);
+ cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
+ cpucount--;
+ }
+
+#else
Dprintk("Setting warm reset code and vector.\n");
store_NMI_vector(&nmi_high, &nmi_low);
@@ -844,6 +1016,7 @@ static int __init do_boot_cpu(int apicid
/* mark "stuck" area as not stuck */
*((volatile unsigned long *)trampoline_base) = 0;
+#endif
return boot_error;
}
@@ -882,7 +1055,9 @@ static void smp_tune_scheduling (void)
* Cycle through the processors sending APIC IPIs to boot each.
*/
+#if 0
static int boot_cpu_logical_apicid;
+#endif
/* Where the IO area was mapped on multiquad, always 0 otherwise */
void *xquad_portio;
@@ -892,8 +1067,11 @@ EXPORT_SYMBOL(cpu_core_map);
static void __init smp_boot_cpus(unsigned int max_cpus)
{
- int apicid, cpu, bit, kicked;
+ int cpu, kicked;
unsigned long bogosum = 0;
+#if 0
+ int apicid, bit;
+#endif
/*
* Setup boot CPU information
@@ -902,9 +1080,15 @@ static void __init smp_boot_cpus(unsigne
printk("CPU%d: ", 0);
print_cpu_info(&cpu_data[0]);
+#if 0
boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
boot_cpu_logical_apicid = logical_smp_processor_id();
x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+#else
+ // boot_cpu_physical_apicid = 0;
+ // boot_cpu_logical_apicid = 0;
+ x86_cpu_to_apicid[0] = 0;
+#endif
current_thread_info()->cpu = 0;
smp_tune_scheduling();
@@ -914,6 +1098,7 @@ static void __init smp_boot_cpus(unsigne
cpus_clear(cpu_core_map[0]);
cpu_set(0, cpu_core_map[0]);
+#ifdef CONFIG_X86_IO_APIC
/*
* If we couldn't find an SMP configuration at boot time,
* get out of here now!
@@ -921,16 +1106,22 @@ static void __init smp_boot_cpus(unsigne
if (!smp_found_config && !acpi_lapic) {
printk(KERN_NOTICE "SMP motherboard not detected.\n");
smpboot_clear_io_apic_irqs();
+#if 0
phys_cpu_present_map = physid_mask_of_physid(0);
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
if (APIC_init_uniprocessor())
printk(KERN_NOTICE "Local APIC not detected."
" Using dummy APIC emulation.\n");
+#endif
map_cpu_to_logical_apicid();
cpu_set(0, cpu_sibling_map[0]);
cpu_set(0, cpu_core_map[0]);
return;
}
+#endif
+#if 0
/*
* Should not be necessary because the MP table should list the boot
* CPU too, but we do it for the sake of robustness anyway.
@@ -953,27 +1144,35 @@ static void __init smp_boot_cpus(unsigne
phys_cpu_present_map = physid_mask_of_physid(0);
cpu_set(0, cpu_sibling_map[0]);
cpu_set(0, cpu_core_map[0]);
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
return;
}
verify_local_APIC();
+#endif
/*
* If SMP should be disabled, then really disable it!
*/
if (!max_cpus) {
- smp_found_config = 0;
+ HYPERVISOR_shared_info->n_vcpu = 1;
printk(KERN_INFO "SMP mode deactivated, forcing use of dummy
APIC emulation.\n");
smpboot_clear_io_apic_irqs();
+#if 0
phys_cpu_present_map = physid_mask_of_physid(0);
- cpu_set(0, cpu_sibling_map[0]);
- cpu_set(0, cpu_core_map[0]);
+#endif
return;
}
+ smp_intr_init();
+
+#if 0
connect_bsp_APIC();
setup_local_APIC();
+#endif
map_cpu_to_logical_apicid();
+#if 0
setup_portio_remap();
@@ -986,32 +1185,33 @@ static void __init smp_boot_cpus(unsigne
* clustered apic ID.
*/
Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
+#endif
+ Dprintk("CPU present map: %lx\n",
+ (1UL << HYPERVISOR_shared_info->n_vcpu) - 1);
kicked = 1;
- for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
- apicid = cpu_present_to_apicid(bit);
- /*
- * Don't even attempt to start the boot CPU!
- */
- if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
- continue;
-
- if (!check_apicid_present(bit))
- continue;
+ for (cpu = 1; kicked < NR_CPUS &&
+ cpu < HYPERVISOR_shared_info->n_vcpu; cpu++) {
if (max_cpus <= cpucount+1)
continue;
- if (do_boot_cpu(apicid))
+#ifdef CONFIG_SMP_ALTERNATIVES
+ if (kicked == 1)
+ prepare_for_smp();
+#endif
+ if (do_boot_cpu(cpu))
printk("CPU #%d not responding - cannot use it.\n",
- apicid);
+ cpu);
else
++kicked;
}
+#if 0
/*
* Cleanup possible dangling ends...
*/
smpboot_restore_warm_reset_vector();
+#endif
/*
* Allow the user to impress friends.
@@ -1078,7 +1278,6 @@ static void __init smp_boot_cpus(unsigne
 printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
smp_num_siblings = siblings;
}
-
if (c->x86_num_cores > 1) {
for (i = 0; i < NR_CPUS; i++) {
if (!cpu_isset(i, cpu_callout_map))
@@ -1094,6 +1293,7 @@ static void __init smp_boot_cpus(unsigne
smpboot_setup_io_apic();
+#if 0
setup_boot_APIC_clock();
/*
@@ -1101,12 +1301,16 @@ static void __init smp_boot_cpus(unsigne
*/
if (cpu_has_tsc && cpucount && cpu_khz)
synchronize_tsc_bp();
+#endif
}
/* These are wrappers to interface to the new boot process. Someone
who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
void __init smp_prepare_cpus(unsigned int max_cpus)
{
+ smp_commenced_mask = cpumask_of_cpu(0);
+ cpu_callin_map = cpumask_of_cpu(0);
+ mb();
smp_boot_cpus(max_cpus);
}
@@ -1116,20 +1320,189 @@ void __devinit smp_prepare_boot_cpu(void
cpu_set(smp_processor_id(), cpu_callout_map);
}
-int __devinit __cpu_up(unsigned int cpu)
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm-xen/ctrl_if.h>
+
+/* hotplug down/up function pointer and target vcpu */
+struct vcpu_hotplug_handler_t {
+ void (*fn)(int vcpu);
+ u32 vcpu;
+};
+static struct vcpu_hotplug_handler_t vcpu_hotplug_handler;
+
+/* must be called with the cpucontrol mutex held */
+static int __devinit cpu_enable(unsigned int cpu)
+{
+#ifdef CONFIG_SMP_ALTERNATIVES
+ if (num_online_cpus() == 1)
+ prepare_for_smp();
+#endif
+
+ /* get the target out of its holding state */
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ wmb();
+
+ /* wait for the processor to ack it. timeout? */
+ while (!cpu_online(cpu))
+ cpu_relax();
+
+ fixup_irqs(cpu_online_map);
+
+ /* counter the disable in fixup_irqs() */
+ local_irq_enable();
+ return 0;
+}
+
+int __cpu_disable(void)
{
- /* This only works at boot for x86. See "rewrite" above. */
- if (cpu_isset(cpu, smp_commenced_mask)) {
- local_irq_enable();
- return -ENOSYS;
+ cpumask_t map = cpu_online_map;
+ int cpu = smp_processor_id();
+
+ /*
+ * Perhaps use cpufreq to drop frequency, but that could go
+ * into generic code.
+ *
+ * We won't take down the boot processor on i386 due to some
+ * interrupts only being able to be serviced by the BSP.
+ * Especially so if we're not using an IOAPIC -zwane
+ */
+ if (cpu == 0)
+ return -EBUSY;
+
+ cpu_clear(cpu, map);
+ fixup_irqs(map);
+
+ /* It's now safe to remove this processor from the online map */
+ cpu_clear(cpu, cpu_online_map);
+
+#ifdef CONFIG_SMP_ALTERNATIVES
+ if (num_online_cpus() == 1)
+ unprepare_for_smp();
+#endif
+
+ return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We don't do anything here: idle task is faking death itself. */
+ unsigned int i;
+
+ for (i = 0; i < 10; i++) {
+ /* They ack this in play_dead by setting CPU_DEAD */
+ if (per_cpu(cpu_state, cpu) == CPU_DEAD)
+ return;
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(HZ/10);
+ }
+ printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+static int vcpu_hotplug_cpu_process(void *unused)
+{
+ struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler;
+
+ if (handler->fn) {
+ (*(handler->fn))(handler->vcpu);
+ handler->fn = NULL;
}
+ return 0;
+}
+
+static void __vcpu_hotplug_handler(void *unused)
+{
+ int err;
+
+ err = kernel_thread(vcpu_hotplug_cpu_process,
+ NULL, CLONE_FS | CLONE_FILES);
+ if (err < 0)
+ printk(KERN_ALERT "Error creating hotplug_cpu process!\n");
+
+}
+
+static void vcpu_hotplug_event_handler(ctrl_msg_t *msg, unsigned long id)
+{
+ static DECLARE_WORK(vcpu_hotplug_work, __vcpu_hotplug_handler, NULL);
+ vcpu_hotplug_t *req = (vcpu_hotplug_t *)&msg->msg[0];
+ struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler;
+ ssize_t ret;
+
+ if (msg->length != sizeof(vcpu_hotplug_t))
+ goto parse_error;
+
+ /* grab target vcpu from msg */
+ handler->vcpu = req->vcpu;
+
+ /* determine which function to call based on msg subtype */
+ switch (msg->subtype) {
+ case CMSG_VCPU_HOTPLUG_OFF:
+ handler->fn = (void *)&cpu_down;
+ ret = schedule_work(&vcpu_hotplug_work);
+ req->status = (u32) ret;
+ break;
+ case CMSG_VCPU_HOTPLUG_ON:
+ handler->fn = (void *)&cpu_up;
+ ret = schedule_work(&vcpu_hotplug_work);
+ req->status = (u32) ret;
+ break;
+ default:
+ goto parse_error;
+ }
+
+ ctrl_if_send_response(msg);
+ return;
+ parse_error:
+ msg->length = 0;
+ ctrl_if_send_response(msg);
+}
+
+static int __init setup_vcpu_hotplug_event(void)
+{
+ struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler;
+
+ handler->fn = NULL;
+ ctrl_if_register_receiver(CMSG_VCPU_HOTPLUG,
+ vcpu_hotplug_event_handler, 0);
+
+ return 0;
+}
+
+__initcall(setup_vcpu_hotplug_event);
+#else /* ... !CONFIG_HOTPLUG_CPU */
+int __cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We said "no" in __cpu_disable */
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __devinit __cpu_up(unsigned int cpu)
+{
/* In case one didn't come up */
if (!cpu_isset(cpu, cpu_callin_map)) {
+ printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
local_irq_enable();
return -EIO;
}
+#ifdef CONFIG_HOTPLUG_CPU
+#ifdef CONFIG_XEN
+ /* Tell hypervisor to bring vcpu up. */
+ HYPERVISOR_vcpu_up(cpu);
+#endif
+ /* Already up, and in cpu_quiescent now? */
+ if (cpu_isset(cpu, smp_commenced_mask)) {
+ cpu_enable(cpu);
+ return 0;
+ }
+#endif
+
local_irq_enable();
/* Unleash the CPU! */
cpu_set(cpu, smp_commenced_mask);
@@ -1140,6 +1513,8 @@ int __devinit __cpu_up(unsigned int cpu)
void __init smp_cpus_done(unsigned int max_cpus)
{
+#if 1
+#else
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
@@ -1148,25 +1523,26 @@ void __init smp_cpus_done(unsigned int m
* Disable executability of the SMP trampoline:
*/
set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+#endif
}
+extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
+extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
+
void __init smp_intr_init(void)
{
- /*
- * IRQ0 must be given a fixed assignment and initialized,
- * because it's used before the IO-APIC is set up.
- */
- set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
-
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPI for invalidation */
- set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+ int cpu = smp_processor_id();
- /* IPI for generic function call */
- set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+ per_cpu(resched_irq, cpu) =
+ bind_ipi_on_cpu_to_irq(RESCHEDULE_VECTOR);
+ sprintf(resched_name[cpu], "resched%d", cpu);
+ BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt,
+ SA_INTERRUPT, resched_name[cpu], NULL));
+
+ per_cpu(callfunc_irq, cpu) =
+ bind_ipi_on_cpu_to_irq(CALL_FUNCTION_VECTOR);
+ sprintf(callfunc_name[cpu], "callfunc%d", cpu);
+ BUG_ON(request_irq(per_cpu(callfunc_irq, cpu),
+ smp_call_function_interrupt,
+ SA_INTERRUPT, callfunc_name[cpu], NULL));
}
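
The hotplug path above boils down to a handshake on the per-cpu
cpu_state variable: the dying vcpu stores CPU_DEAD in play_dead() and
spins in HYPERVISOR_yield() until someone stores CPU_UP_PREPARE;
cpu_enable() performs that store, and __cpu_die() just polls for
CPU_DEAD. A toy model with a thread standing in for the vcpu (names
invented; compile with -pthread):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	enum { UP_PREPARE_EX = 1, DEAD_EX = 2 };
	static _Atomic int cpu_state_ex;

	static void *vcpu_ex(void *arg)        /* play_dead() side */
	{
		(void)arg;
		atomic_store(&cpu_state_ex, DEAD_EX);    /* ack the down */
		while (atomic_load(&cpu_state_ex) != UP_PREPARE_EX)
			;                      /* HYPERVISOR_yield() loop */
		printf("vcpu: back up\n");
		return NULL;
	}

	int main(void)                 /* __cpu_die()/cpu_enable() side */
	{
		pthread_t t;

		pthread_create(&t, NULL, vcpu_ex, NULL);
		while (atomic_load(&cpu_state_ex) != DEAD_EX)
			;                              /* wait for ack */
		atomic_store(&cpu_state_ex, UP_PREPARE_EX); /* revive it */
		pthread_join(t, NULL);
		return 0;
	}
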
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/smp.c linux-2.6-xen-sparse/arch/i386/kernel/smp.c
--- pristine-linux-2.6.12/arch/i386/kernel/smp.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/smp.c	2005-07-28 13:17:07.000000000 -0700
@@ -19,10 +19,16 @@
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
+#include <linux/cpu.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
+#if 0
#include <mach_apic.h>
+#endif
+#include <asm-xen/evtchn.h>
+
+#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg)
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -121,31 +127,49 @@ static inline int __prepare_ICR2 (unsign
return SET_APIC_DEST_FIELD(mask);
}
-void __send_IPI_shortcut(unsigned int shortcut, int vector)
+DECLARE_PER_CPU(int, ipi_to_evtchn[NR_IPIS]);
+
+static inline void __send_IPI_one(unsigned int cpu, int vector)
{
- /*
- * Subtle. In the case of the 'never do double writes' workaround
- * we have to lock out interrupts to be safe. As we don't care
- * of the value read we use an atomic rmw access to avoid costly
- * cli/sti. Otherwise we use an even cheaper single atomic write
- * to the APIC.
- */
- unsigned int cfg;
+ unsigned int evtchn;
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
+ evtchn = per_cpu(ipi_to_evtchn, cpu)[vector];
+ // printk("send_IPI_mask_bitmask cpu %d vector %d evtchn %d\n", cpu,
vector, evtchn);
+ if (evtchn) {
+#if 0
+ shared_info_t *s = HYPERVISOR_shared_info;
+ while (synch_test_bit(evtchn, &s->evtchn_pending[0]) ||
+ synch_test_bit(evtchn, &s->evtchn_mask[0]))
+ ;
+#endif
+ notify_via_evtchn(evtchn);
+ } else
+ printk("send_IPI to unbound port %d/%d",
+ cpu, vector);
+}
- /*
- * No need to touch the target chip field
- */
- cfg = __prepare_ICR(shortcut, vector);
+void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+ int cpu;
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write_around(APIC_ICR, cfg);
+ switch (shortcut) {
+ case APIC_DEST_SELF:
+ __send_IPI_one(smp_processor_id(), vector);
+ break;
+ case APIC_DEST_ALLBUT:
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ if (cpu == smp_processor_id())
+ continue;
+ if (cpu_isset(cpu, cpu_online_map)) {
+ __send_IPI_one(cpu, vector);
+ }
+ }
+ break;
+ default:
+ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
+ vector);
+ break;
+ }
}
void fastcall send_IPI_self(int vector)
@@ -156,81 +180,32 @@ void fastcall send_IPI_self(int vector)
/*
* This is only used on smaller machines.
*/
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+void send_IPI_mask_bitmask(cpumask_t mask, int vector)
{
- unsigned long mask = cpus_addr(cpumask)[0];
- unsigned long cfg;
unsigned long flags;
+ unsigned int cpu;
local_irq_save(flags);
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(mask);
- apic_write_around(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write_around(APIC_ICR, cfg);
+ WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
+
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ if (cpu_isset(cpu, mask)) {
+ __send_IPI_one(cpu, vector);
+ }
+ }
local_irq_restore(flags);
}
void send_IPI_mask_sequence(cpumask_t mask, int vector)
{
- unsigned long cfg, flags;
- unsigned int query_cpu;
-
- /*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicasts to each CPU instead. This
- * should be modified to do 1 message per cluster ID - mbligh
- */
- local_irq_save(flags);
-
- for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
- if (cpu_isset(query_cpu, mask)) {
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
- apic_write_around(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write_around(APIC_ICR, cfg);
- }
- }
- local_irq_restore(flags);
+ send_IPI_mask_bitmask(mask, vector);
}
#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
+#if 0 /* XEN */
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
@@ -308,7 +283,8 @@ static inline void leave_mm (unsigned lo
* 2) Leave the mm if we are in the lazy tlb mode.
*/
-fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
+irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
{
unsigned long cpu;
@@ -334,32 +310,33 @@ fastcall void smp_invalidate_interrupt(s
} else
leave_mm(cpu);
}
- ack_APIC_irq();
smp_mb__before_clear_bit();
cpu_clear(cpu, flush_cpumask);
smp_mb__after_clear_bit();
out:
put_cpu_no_resched();
+
+ return IRQ_HANDLED;
}
static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
unsigned long va)
{
- cpumask_t tmp;
/*
* A couple of (to be removed) sanity checks:
*
- * - we do not send IPIs to not-yet booted CPUs.
* - current CPU must not be in mask
* - mask must exist :)
*/
BUG_ON(cpus_empty(cpumask));
-
- cpus_and(tmp, cpumask, cpu_online_map);
- BUG_ON(!cpus_equal(cpumask, tmp));
BUG_ON(cpu_isset(smp_processor_id(), cpumask));
BUG_ON(!mm);
+ /* If a CPU which we ran on has gone down, OK. */
+ cpus_and(cpumask, cpumask, cpu_online_map);
+ if (cpus_empty(cpumask))
+ return;
+
/*
* i'm not happy about this global shared spinlock in the
* MM hot path, but we'll see how contended it is.
@@ -443,7 +420,7 @@ void flush_tlb_page(struct vm_area_struc
if (current->active_mm == mm) {
if(current->mm)
__flush_tlb_one(va);
- else
+ else
leave_mm(smp_processor_id());
}
@@ -467,6 +444,22 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
}
+#else
+
+irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
+{ return 0; }
+void flush_tlb_current_task(void)
{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
+void flush_tlb_mm(struct mm_struct * mm)
+{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
+void flush_tlb_all(void)
+{ xen_tlb_flush_all(); }
+
+#endif /* XEN */
+
/*
* this function sends a 'reschedule' IPI to another CPU.
* it goes straight through and wastes no time serializing
@@ -474,6 +467,7 @@ void flush_tlb_all(void)
*/
void smp_send_reschedule(int cpu)
{
+ WARN_ON(cpu_is_offline(cpu));
send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}
@@ -514,10 +508,16 @@ int smp_call_function (void (*func) (voi
*/
{
struct call_data_struct data;
- int cpus = num_online_cpus()-1;
+ int cpus;
- if (!cpus)
+ /* Holding any lock stops cpus from going down. */
+ spin_lock(&call_lock);
+ cpus = num_online_cpus()-1;
+
+ if (!cpus) {
+ spin_unlock(&call_lock);
return 0;
+ }
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
@@ -529,7 +529,6 @@ int smp_call_function (void (*func) (voi
if (wait)
atomic_set(&data.finished, 0);
- spin_lock(&call_lock);
call_data = &data;
mb();
@@ -538,11 +537,11 @@ int smp_call_function (void (*func) (voi
/* Wait for response */
while (atomic_read(&data.started) != cpus)
- cpu_relax();
+ barrier();
if (wait)
while (atomic_read(&data.finished) != cpus)
- cpu_relax();
+ barrier();
spin_unlock(&call_lock);
return 0;
@@ -555,7 +554,11 @@ static void stop_this_cpu (void * dummy)
*/
cpu_clear(smp_processor_id(), cpu_online_map);
local_irq_disable();
+#if 1
+ xxprint("stop_this_cpu disable_local_APIC\n");
+#else
disable_local_APIC();
+#endif
if (cpu_data[smp_processor_id()].hlt_works_ok)
for(;;) __asm__("hlt");
for (;;);
@@ -570,7 +573,11 @@ void smp_send_stop(void)
smp_call_function(stop_this_cpu, NULL, 1, 0);
local_irq_disable();
+#if 1
+ xxprint("smp_send_stop disable_local_APIC\n");
+#else
disable_local_APIC();
+#endif
local_irq_enable();
}
@@ -579,18 +586,21 @@ void smp_send_stop(void)
* all the work is done automatically when
* we return from the interrupt.
*/
-fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
+irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
{
- ack_APIC_irq();
+
+ return IRQ_HANDLED;
}
-fastcall void smp_call_function_interrupt(struct pt_regs *regs)
+#include <linux/kallsyms.h>
+irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
{
void (*func) (void *info) = call_data->func;
void *info = call_data->info;
int wait = call_data->wait;
- ack_APIC_irq();
/*
* Notify initiating CPU that I've grabbed the data and am
* about to execute the function
@@ -608,5 +618,7 @@ fastcall void smp_call_function_interrup
mb();
atomic_inc(&call_data->finished);
}
+
+ return IRQ_HANDLED;
}
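
With no local APIC in the picture, an IPI in the smp.c rewrite above is
just a notification on a pre-bound per-cpu event channel:
__send_IPI_one() is a table lookup plus notify_via_evtchn(), with the
binding done by smp_intr_init() in the smpboot.c hunk. The skeleton of
that, with invented names:

	#include <stdio.h>

	#define NR_CPUS_EX 4
	#define NR_IPIS_EX 8

	/* Per-cpu vector -> event channel table. */
	static int ipi_to_evtchn_ex[NR_CPUS_EX][NR_IPIS_EX];

	static void notify_ex(int evtchn)   /* models notify_via_evtchn() */
	{
		printf("kick event channel %d\n", evtchn);
	}

	static void send_ipi_one_ex(int cpu, int vector)
	{
		int evtchn = ipi_to_evtchn_ex[cpu][vector];

		if (evtchn)
			notify_ex(evtchn);
		else
			printf("send_IPI to unbound port %d/%d\n",
			       cpu, vector);
	}

	int main(void)
	{
		ipi_to_evtchn_ex[1][0] = 17;  /* pretend RESCHEDULE bound */
		send_ipi_one_ex(1, 0);        /* kicks channel 17 */
		send_ipi_one_ex(2, 0);        /* unbound: logged instead */
		return 0;
	}
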
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/time.c linux-2.6-xen-sparse/arch/i386/kernel/time.c
--- pristine-linux-2.6.12/arch/i386/kernel/time.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/time.c	2005-07-28 13:17:07.000000000 -0700
@@ -46,6 +46,8 @@
#include <linux/bcd.h>
#include <linux/efi.h>
#include <linux/mca.h>
+#include <linux/sysctl.h>
+#include <linux/percpu.h>
#include <asm/io.h>
#include <asm/smp.h>
@@ -71,13 +73,24 @@
extern spinlock_t i8259A_lock;
int pit_latch_buggy; /* extern */
-#include "do_timer.h"
-
u64 jiffies_64 = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
+#if defined(__x86_64__)
+unsigned long vxtime_hz = PIT_TICK_RATE;
+struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
+struct timespec __xtime __section_xtime;
+struct timezone __sys_tz __section_sys_tz;
+#endif
+
+#if defined(__x86_64__)
+unsigned int cpu_khz; /* Detected as we calibrate the TSC */
+#else
unsigned long cpu_khz; /* Detected as we calibrate the TSC */
+#endif
extern unsigned long wall_jiffies;
@@ -86,7 +99,210 @@ DEFINE_SPINLOCK(rtc_lock);
DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);
-struct timer_opts *cur_timer = &timer_none;
+extern struct init_timer_opts timer_tsc_init;
+extern struct timer_opts timer_tsc;
+struct timer_opts *cur_timer = &timer_tsc;
+
+/* These are periodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ u32 tsc_to_usec_mul;
+ int tsc_shift;
+ u32 version;
+};
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+static struct timeval shadow_tv;
+
+/* Keep track of last time we did processing/updating of jiffies and xtime. */
+static u64 processed_system_time; /* System time (ns) at last processing. */
+static DEFINE_PER_CPU(u64, processed_system_time);
+
+#define NS_PER_TICK (1000000000ULL/HZ)
+
+#define HANDLE_USEC_UNDERFLOW(_tv) do { \
+ while ((_tv).tv_usec < 0) { \
+ (_tv).tv_usec += USEC_PER_SEC; \
+ (_tv).tv_sec--; \
+ } \
+} while (0)
+#define HANDLE_USEC_OVERFLOW(_tv) do { \
+ while ((_tv).tv_usec >= USEC_PER_SEC) { \
+ (_tv).tv_usec -= USEC_PER_SEC; \
+ (_tv).tv_sec++; \
+ } \
+} while (0)
+static inline void __normalize_time(time_t *sec, s64 *nsec)
+{
+ while (*nsec >= NSEC_PER_SEC) {
+ (*nsec) -= NSEC_PER_SEC;
+ (*sec)++;
+ }
+ while (*nsec < 0) {
+ (*nsec) += NSEC_PER_SEC;
+ (*sec)--;
+ }
+}
+
+/* Does this guest OS track Xen time, or set its wall clock independently? */
+static int independent_wallclock = 0;
+static int __init __independent_wallclock(char *str)
+{
+ independent_wallclock = 1;
+ return 1;
+}
+__setup("independent_wallclock", __independent_wallclock);
+#define INDEPENDENT_WALLCLOCK() \
+ (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN))
+
+int tsc_disable __initdata = 0;
+
+static void delay_tsc(unsigned long loops)
+{
+ unsigned long bclock, now;
+
+ rdtscl(bclock);
+ do
+ {
+ rep_nop();
+ rdtscl(now);
+ } while ((now-bclock) < loops);
+}
+
+struct timer_opts timer_tsc = {
+ .name = "tsc",
+ .delay = delay_tsc,
+};
+
+static inline u32 down_shift(u64 time, int shift)
+{
+ if ( shift < 0 )
+ return (u32)(time >> -shift);
+ return (u32)((u32)time << shift);
+}
+
+/*
+ * 32-bit multiplication of integer multiplicand and fractional multiplier
+ * yielding 32-bit integer product.
+ */
+static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
+{
+ u32 product_int, product_frac;
+ __asm__ (
+ "mul %3"
+ : "=a" (product_frac), "=d" (product_int)
+ : "0" (multiplicand), "r" (multiplier) );
+ return product_int;
+}
+
+void init_cpu_khz(void)
+{
+ u64 __cpu_khz = 1000000ULL << 32;
+ struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_time[0];
+ do_div(__cpu_khz, info->tsc_to_system_mul);
+ cpu_khz = down_shift(__cpu_khz, -info->tsc_shift);
+ printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n",
+ cpu_khz / 1000, cpu_khz % 1000);
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+ u64 now;
+ u32 delta;
+ rdtscll(now);
+ delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift);
+ return mul_frac(delta, shadow->tsc_to_nsec_mul);
+}
+
+static unsigned long get_usec_offset(struct shadow_time_info *shadow)
+{
+ u64 now;
+ u32 delta;
+ rdtscll(now);
+ delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift);
+ return mul_frac(delta, shadow->tsc_to_usec_mul);
+}
+
+static void update_wallclock(void)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ long wtm_nsec, xtime_nsec;
+ time_t wtm_sec, xtime_sec;
+ u64 tmp, usec;
+
+ shadow_tv.tv_sec = s->wc_sec;
+ shadow_tv.tv_usec = s->wc_usec;
+
+ if (INDEPENDENT_WALLCLOCK())
+ return;
+
+ if ((time_status & STA_UNSYNC) != 0)
+ return;
+
+ /* Adjust wall-clock time base based on wall_jiffies ticks. */
+ usec = processed_system_time;
+ do_div(usec, 1000);
+ usec += (u64)shadow_tv.tv_sec * 1000000ULL;
+ usec += (u64)shadow_tv.tv_usec;
+ usec -= (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ);
+
+ /* Split wallclock base into seconds and nanoseconds. */
+ tmp = usec;
+ xtime_nsec = do_div(tmp, 1000000) * 1000ULL;
+ xtime_sec = (time_t)tmp;
+
+ wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
+ wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
+
+ set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
+ set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area. Must be called with the xtime_lock held for writing.
+ */
+static void __get_time_values_from_xen(void)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ struct vcpu_time_info *src;
+ struct shadow_time_info *dst;
+
+ src = &s->vcpu_time[smp_processor_id()];
+ dst = &per_cpu(shadow_time, smp_processor_id());
+
+ do {
+ dst->version = src->time_version2;
+ rmb();
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ rmb();
+ }
+ while (dst->version != src->time_version1);
+
+ dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
+
+ if ((shadow_tv.tv_sec != s->wc_sec) ||
+ (shadow_tv.tv_usec != s->wc_usec))
+ update_wallclock();
+}
+
+static inline int time_values_up_to_date(int cpu)
+{
+ struct vcpu_time_info *src;
+ struct shadow_time_info *dst;
+
+ src = &HYPERVISOR_shared_info->vcpu_time[cpu];
+ dst = &per_cpu(shadow_time, cpu);
+
+ return (dst->version == src->time_version2);
+}
+
+#define TIME_VALUES_UP_TO_DATE time_values_up_to_date(smp_processor_id())
/*
* This is a special lock that is owned by the CPU and holds the index
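
A note on the conversion helpers above: Xen publishes a per-VCPU
(tsc_to_system_mul, tsc_shift) pair, and down_shift()/mul_frac() apply it
as a scaled fixed-point multiply. Below is a minimal standalone sketch of
the same arithmetic -- not part of the patch, and the 2GHz figure is
invented for illustration.

#include <stdint.h>
#include <stdio.h>

/* Same helpers as in the patch, with kernel types swapped for stdint. */
static uint32_t down_shift(uint64_t time, int shift)
{
	if (shift < 0)
		return (uint32_t)(time >> -shift);
	return (uint32_t)((uint32_t)time << shift);
}

/* 32x32->64 multiply keeping the integer part: (a * b) >> 32. */
static uint32_t mul_frac(uint32_t multiplicand, uint32_t multiplier)
{
	return (uint32_t)(((uint64_t)multiplicand * multiplier) >> 32);
}

int main(void)
{
	/* Hypothetical 2GHz CPU: 0.5ns per cycle, expressed as the
	 * 0.32 fixed-point fraction 0.5 * 2^32. */
	uint32_t tsc_to_nsec_mul = 0x80000000u;
	int tsc_shift = 0;
	uint64_t tsc_delta = 4000000;	/* 4e6 cycles since last update */

	uint32_t ns = mul_frac(down_shift(tsc_delta, tsc_shift),
			       tsc_to_nsec_mul);
	printf("%u ns\n", (unsigned)ns);	/* prints 2000000, i.e. 2ms */
	return 0;
}
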
@@ -126,13 +342,20 @@ void do_gettimeofday(struct timeval *tv)
unsigned long seq;
unsigned long usec, sec;
unsigned long max_ntp_tick;
+ unsigned long flags;
+ s64 nsec;
+ unsigned int cpu;
+ struct shadow_time_info *shadow;
+
+ cpu = get_cpu();
+ shadow = &per_cpu(shadow_time, cpu);
do {
unsigned long lost;
seq = read_seqbegin(&xtime_lock);
- usec = cur_timer->get_offset();
+ usec = get_usec_offset(shadow);
lost = jiffies - wall_jiffies;
/*
@@ -151,11 +374,31 @@ void do_gettimeofday(struct timeval *tv)
usec += lost * (USEC_PER_SEC / HZ);
sec = xtime.tv_sec;
- usec += (xtime.tv_nsec / 1000);
+ usec += (xtime.tv_nsec / NSEC_PER_USEC);
+
+ nsec = shadow->system_timestamp - processed_system_time;
+ __normalize_time(&sec, &nsec);
+ usec += (long)nsec / NSEC_PER_USEC;
+
+ if (unlikely(!time_values_up_to_date(cpu))) {
+ /*
+ * We may have blocked for a long time,
+ * rendering our calculations invalid
+ * (e.g. the time delta may have
+ * overflowed). Detect that and recalculate
+ * with fresh values.
+ */
+ write_seqlock_irqsave(&xtime_lock, flags);
+ __get_time_values_from_xen();
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ continue;
+ }
} while (read_seqretry(&xtime_lock, seq));
- while (usec >= 1000000) {
- usec -= 1000000;
+ put_cpu();
+
+ while (usec >= USEC_PER_SEC) {
+ usec -= USEC_PER_SEC;
sec++;
}
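
The recalculation path above relies on the version handshake in
__get_time_values_from_xen(). Reduced to its essentials, the protocol
looks like the sketch below; the struct and field names are hypothetical
stand-ins for vcpu_time_info, and the rmb() barriers from the patch are
shown as comments.

struct snapshot {
	unsigned version2;		/* producer bumps before writing */
	unsigned long long payload;	/* stands in for the time fields */
	unsigned version1;		/* producer bumps after writing */
};

/* Consumer: copy, then check that no update raced with the copy. */
static void read_snapshot(volatile struct snapshot *src,
			  struct snapshot *dst)
{
	do {
		dst->version2 = src->version2;
		/* rmb(); */
		dst->payload = src->payload;
		/* rmb(); */
	} while (dst->version2 != src->version1);
}
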
@@ -168,21 +411,49 @@ EXPORT_SYMBOL(do_gettimeofday);
int do_settimeofday(struct timespec *tv)
{
time_t wtm_sec, sec = tv->tv_sec;
- long wtm_nsec, nsec = tv->tv_nsec;
+ long wtm_nsec;
+ s64 nsec;
+ struct timespec xentime;
+ unsigned int cpu;
+ struct shadow_time_info *shadow;
if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
+ if (!INDEPENDENT_WALLCLOCK())
+ return 0; /* Silent failure? */
+
+ cpu = get_cpu();
+ shadow = &per_cpu(shadow_time, cpu);
+
write_seqlock_irq(&xtime_lock);
+
+ /*
+ * We may get blocked for a long time, in which case our time delta can
+ * overflow and our shadow time values go stale. Detect that case and
+ * retry with fresh values.
+ */
+ again:
+ nsec = (s64)tv->tv_nsec - (s64)get_nsec_offset(shadow);
+ if (unlikely(!time_values_up_to_date(cpu))) {
+ __get_time_values_from_xen();
+ goto again;
+ }
+
+ __normalize_time(&sec, &nsec);
+ set_normalized_timespec(&xentime, sec, nsec);
+
/*
* This is revolting. We need to set "xtime" correctly. However, the
* value in this location is the value at the most recent update of
* wall time. Discover what correction gettimeofday() would have
* made, and then undo it!
*/
- nsec -= cur_timer->get_offset() * NSEC_PER_USEC;
nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
+ nsec -= (shadow->system_timestamp - processed_system_time);
+
+ __normalize_time(&sec, &nsec);
wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
@@ -193,13 +464,29 @@ int do_settimeofday(struct timespec *tv)
time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;
- write_sequnlock_irq(&xtime_lock);
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ if (xen_start_info.flags & SIF_INITDOMAIN) {
+ dom0_op_t op;
+ op.cmd = DOM0_SETTIME;
+ op.u.settime.secs = xentime.tv_sec;
+ op.u.settime.usecs = xentime.tv_nsec / NSEC_PER_USEC;
+ op.u.settime.system_time = shadow->system_timestamp;
+ write_sequnlock_irq(&xtime_lock);
+ HYPERVISOR_dom0_op(&op);
+ } else
+#endif
+ write_sequnlock_irq(&xtime_lock);
+
+ put_cpu();
+
clock_was_set();
return 0;
}
EXPORT_SYMBOL(do_settimeofday);
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
static int set_rtc_mmss(unsigned long nowtime)
{
int retval;
@@ -216,9 +503,12 @@ static int set_rtc_mmss(unsigned long no
return retval;
}
-
-
-int timer_ack;
+#else
+static int set_rtc_mmss(unsigned long nowtime)
+{
+ return 0;
+}
+#endif
/* monotonic_clock(): returns # of nanoseconds passed since time_init()
* Note: This function is required to return accurate
@@ -226,10 +516,31 @@ int timer_ack;
*/
unsigned long long monotonic_clock(void)
{
- return cur_timer->monotonic_clock();
+ int cpu = get_cpu();
+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+ s64 off;
+ unsigned long flags;
+
+ for ( ; ; ) {
+ off = get_nsec_offset(shadow);
+ if (time_values_up_to_date(cpu))
+ break;
+ write_seqlock_irqsave(&xtime_lock, flags);
+ __get_time_values_from_xen();
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ }
+
+ put_cpu();
+
+ return shadow->system_timestamp + off;
}
EXPORT_SYMBOL(monotonic_clock);
+unsigned long long sched_clock(void)
+{
+ return monotonic_clock();
+}
+
#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
unsigned long profile_pc(struct pt_regs *regs)
{
@@ -250,37 +561,47 @@ EXPORT_SYMBOL(profile_pc);
static inline void do_timer_interrupt(int irq, void *dev_id,
struct pt_regs *regs)
{
-#ifdef CONFIG_X86_IO_APIC
- if (timer_ack) {
- /*
- * Subtle, when I/O APICs are used we have to ack timer IRQ
- * manually to reset the IRR bit for do_slow_gettimeoffset().
- * This will also deassert NMI lines for the watchdog if run
- * on an 82489DX-based system.
- */
- spin_lock(&i8259A_lock);
- outb(0x0c, PIC_MASTER_OCW3);
- /* Ack the IRQ; AEOI will end it automatically. */
- inb(PIC_MASTER_POLL);
- spin_unlock(&i8259A_lock);
- }
-#endif
+ s64 delta, delta_cpu;
+ int cpu = smp_processor_id();
+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+
+ do {
+ __get_time_values_from_xen();
- do_timer_interrupt_hook(regs);
+ delta = delta_cpu =
+ shadow->system_timestamp + get_nsec_offset(shadow);
+ delta -= processed_system_time;
+ delta_cpu -= per_cpu(processed_system_time, cpu);
+ }
+ while (!time_values_up_to_date(cpu));
+ if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) {
+ printk("Timer ISR/%d: Time went backwards: "
+ "delta=%lld cpu_delta=%lld shadow=%lld "
+ "off=%lld processed=%lld cpu_processed=%lld\n",
+ cpu, delta, delta_cpu, shadow->system_timestamp,
+ (s64)get_nsec_offset(shadow),
+ processed_system_time,
+ per_cpu(processed_system_time, cpu));
+ for (cpu = 0; cpu < num_online_cpus(); cpu++)
+ printk(" %d: %lld\n", cpu,
+ per_cpu(processed_system_time, cpu));
+ return;
+ }
- if (MCA_bus) {
- /* The PS/2 uses level-triggered interrupts. You can't
- turn them off, nor would you want to (any attempt to
- enable edge-triggered interrupts usually gets intercepted by a
- special hardware circuit). Hence we have to acknowledge
- the timer interrupt. Through some incredibly stupid
- design idea, the reset for IRQ 0 is done by setting the
- high bit of the PPI port B (0x61). Note that some PS/2s,
- notably the 55SX, work fine if this is removed. */
+ /* System-wide jiffy work. */
+ while (delta >= NS_PER_TICK) {
+ delta -= NS_PER_TICK;
+ processed_system_time += NS_PER_TICK;
+ do_timer(regs);
+ }
- irq = inb_p( 0x61 ); /* read the current state */
- outb_p( irq|0x80, 0x61 ); /* reset the IRQ */
+ /* Local CPU jiffy work. */
+ while (delta_cpu >= NS_PER_TICK) {
+ delta_cpu -= NS_PER_TICK;
+ per_cpu(processed_system_time, cpu) += NS_PER_TICK;
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING, regs);
}
}
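
Since Xen delivers a single virtual timer event that may cover several
missed ticks, the handler above folds the elapsed nanoseconds into whole
ticks, once system-wide and once per CPU. A standalone model of that
accounting (not part of the patch; numbers invented for illustration):

#include <stdio.h>

#define HZ		100
#define NS_PER_TICK	(1000000000ULL / HZ)

int main(void)
{
	unsigned long long processed = 0;	/* ns accounted so far */
	unsigned long long now = 35000000;	/* 35ms of system time */
	long long delta = now - processed;
	unsigned long jiffies = 0;

	/* Same structure as the "System-wide jiffy work" loop above. */
	while (delta >= (long long)NS_PER_TICK) {
		delta -= NS_PER_TICK;
		processed += NS_PER_TICK;
		jiffies++;		/* stands in for do_timer() */
	}
	printf("%lu ticks, %lld ns left over\n", jiffies, delta);
	/* prints: 3 ticks, 5000000 ns left over */
	return 0;
}
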
@@ -299,11 +620,7 @@ irqreturn_t timer_interrupt(int irq, voi
* locally disabled. -arca
*/
write_seqlock(&xtime_lock);
-
- cur_timer->mark_offset();
-
do_timer_interrupt(irq, NULL, regs);
-
write_sequnlock(&xtime_lock);
return IRQ_HANDLED;
}
@@ -452,6 +769,14 @@ static void __init hpet_time_init(void)
}
#endif
+/* Dynamically-mapped IRQ. */
+static DEFINE_PER_CPU(int, timer_irq);
+
+static struct irqaction irq_timer = {
+ timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer0",
+ NULL, NULL
+};
+
void __init time_init(void)
{
#ifdef CONFIG_HPET_TIMER
@@ -464,13 +789,141 @@ void __init time_init(void)
return;
}
#endif
- xtime.tv_sec = get_cmos_time();
- xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+ __get_time_values_from_xen();
+ xtime.tv_sec = shadow_tv.tv_sec;
+ xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
+ processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+ per_cpu(processed_system_time, 0) = processed_system_time;
- cur_timer = select_timer();
- printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+ init_cpu_khz();
- time_init_hook();
+#if defined(__x86_64__)
+ vxtime.mode = VXTIME_TSC;
+ vxtime.quot = (1000000L << 32) / vxtime_hz;
+ vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+ vxtime.hz = vxtime_hz;
+ sync_core();
+ rdtscll(vxtime.last_tsc);
+#endif
+
+ per_cpu(timer_irq, 0) = bind_virq_to_irq(VIRQ_TIMER);
+ (void)setup_irq(per_cpu(timer_irq, 0), &irq_timer);
+}
+
+/* Convert jiffies to system time. */
+static inline u64 jiffies_to_st(unsigned long j)
+{
+ unsigned long seq;
+ long delta;
+ u64 st;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ delta = j - jiffies;
+ /* NB. The next check can trigger in some wrap-around cases,
+ * but that's ok: we'll just end up with a shorter timeout. */
+ if (delta < 1)
+ delta = 1;
+ st = processed_system_time + (delta * NS_PER_TICK);
+ } while (read_seqretry(&xtime_lock, seq));
+
+ return st;
}
+
+/*
+ * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
+ * These functions are based on implementations from arch/s390/kernel/time.c
+ */
+void stop_hz_timer(void)
+{
+ unsigned int cpu = smp_processor_id();
+ unsigned long j;
+
+ /* s390 does this /before/ checking rcu_pending(). We do the same. */
+ cpu_set(cpu, nohz_cpu_mask);
+
+ /* Leave ourselves in 'tick mode' if rcu or softirq pending. */
+ if (rcu_pending(cpu) || local_softirq_pending()) {
+ cpu_clear(cpu, nohz_cpu_mask);
+ j = jiffies + 1;
+ } else {
+ j = next_timer_interrupt();
+ }
+
+ BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0);
+}
+
+void start_hz_timer(void)
+{
+ cpu_clear(smp_processor_id(), nohz_cpu_mask);
+}
+
+void time_suspend(void)
+{
+ /* nothing */
+}
+
+/* No locking required. We are only CPU running, and interrupts are off. */
+void time_resume(void)
+{
+ init_cpu_khz();
+
+ /* Get timebases for new environment. */
+ __get_time_values_from_xen();
+
+ /* Reset our own concept of passage of system time. */
+ processed_system_time =
+ per_cpu(shadow_time, smp_processor_id()).system_timestamp;
+ per_cpu(processed_system_time, 0) = processed_system_time;
+}
+
+#ifdef CONFIG_SMP
+static char timer_name[NR_CPUS][15];
+void local_setup_timer(void)
+{
+ int seq, cpu = smp_processor_id();
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ per_cpu(processed_system_time, cpu) =
+ per_cpu(shadow_time, cpu).system_timestamp;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER);
+ sprintf(timer_name[cpu], "timer%d", cpu);
+ BUG_ON(request_irq(per_cpu(timer_irq, cpu), timer_interrupt,
+ SA_INTERRUPT, timer_name[cpu], NULL));
+}
+#endif
+
+/*
+ * /proc/sys/xen: this really belongs in another file, but it can stay
+ * here for now.
+ */
+static ctl_table xen_subtable[] = {
+ {1, "independent_wallclock", &independent_wallclock,
+ sizeof(independent_wallclock), 0644, NULL, proc_dointvec},
+ {0}
+};
+static ctl_table xen_table[] = {
+ {123, "xen", NULL, 0, 0555, xen_subtable},
+ {0}
+};
+static int __init xen_sysctl_init(void)
+{
+ (void)register_sysctl_table(xen_table, 0);
+ return 0;
+}
+__initcall(xen_sysctl_init);
+
+/*
+ * Local variables:
+ * c-file-style: "linux"
+ * indent-tabs-mode: t
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * tab-width: 8
+ * End:
+ */
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/timers/Makefile
linux-2.6-xen-sparse/arch/i386/kernel/timers/Makefile
--- pristine-linux-2.6.12/arch/i386/kernel/timers/Makefile 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/timers/Makefile 2005-07-28
13:17:07.000000000 -0700
@@ -2,8 +2,16 @@
# Makefile for x86 timers
#
-obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o
+XENARCH := $(subst ",,$(CONFIG_XENARCH))
-obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o
-obj-$(CONFIG_HPET_TIMER) += timer_hpet.o
-obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o
+obj-y := timer_tsc.o
+c-obj-y :=
+
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+ @ln -fsn $(srctree)/arch/i386/kernel/timers/$(notdir $@) $@
+
+obj-y += $(c-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/timers/timer_tsc.c
linux-2.6-xen-sparse/arch/i386/kernel/timers/timer_tsc.c
--- pristine-linux-2.6.12/arch/i386/kernel/timers/timer_tsc.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/timers/timer_tsc.c 2005-07-28
13:17:07.000000000 -0700
@@ -1,10 +1,6 @@
/*
* This code largely moved from arch/i386/kernel/time.c.
* See comments there for proper credits.
- *
- * 2004-06-25 Jesper Juhl
- * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4
- * failing to inline.
*/
#include <linux/spinlock.h>
@@ -38,12 +34,9 @@ int tsc_disable __initdata = 0;
extern spinlock_t i8253_lock;
static int use_tsc;
-/* Number of usecs that the last interrupt was delayed */
-static int delay_at_last_interrupt;
-static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
-static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
static unsigned long long monotonic_base;
+static u32 monotonic_offset;
static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
/* convert from cycles(64bits) => nanoseconds (64bits)
@@ -74,8 +67,6 @@ static inline unsigned long long cycles_
return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}
-static int count2; /* counter for mark_offset_tsc() */
-
/* Cached *multiplier* to convert TSC counts to microseconds.
* (see the equation below).
* Equal to 2^32 * (1 / (clocks per usec) ).
@@ -83,6 +74,9 @@ static int count2; /* counter for mark_o
*/
static unsigned long fast_gettimeoffset_quotient;
+extern u32 shadow_tsc_stamp;
+extern u64 shadow_system_time;
+
static unsigned long get_offset_tsc(void)
{
register unsigned long eax, edx;
@@ -92,7 +86,7 @@ static unsigned long get_offset_tsc(void
rdtsc(eax,edx);
/* .. relative to previous jiffy (32 bits is enough) */
- eax -= last_tsc_low; /* tsc_low delta */
+ eax -= shadow_tsc_stamp;
/*
* Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
@@ -109,7 +103,7 @@ static unsigned long get_offset_tsc(void
"0" (eax));
/* our adjusted time offset in microseconds */
- return delay_at_last_interrupt + edx;
+ return edx;
}
static unsigned long long monotonic_clock_tsc(void)
@@ -120,7 +114,7 @@ static unsigned long long monotonic_cloc
/* atomically read monotonic base & last_offset */
do {
seq = read_seqbegin(&monotonic_lock);
- last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
+ last_offset = monotonic_offset;
base = monotonic_base;
} while (read_seqretry(&monotonic_lock, seq));
@@ -155,6 +149,17 @@ unsigned long long sched_clock(void)
return cycles_2_ns(this_offset);
}
+
+static void mark_offset_tsc(void)
+{
+
+ /* update the monotonic base value */
+ write_seqlock(&monotonic_lock);
+ monotonic_base = shadow_system_time;
+ monotonic_offset = shadow_tsc_stamp;
+ write_sequnlock(&monotonic_lock);
+}
+
static void delay_tsc(unsigned long loops)
{
unsigned long bclock, now;
@@ -320,245 +325,39 @@ core_initcall(cpufreq_tsc);
static inline void cpufreq_delayed_get(void) { return; }
#endif
-int recalibrate_cpu_khz(void)
-{
-#ifndef CONFIG_SMP
- unsigned long cpu_khz_old = cpu_khz;
-
- if (cpu_has_tsc) {
- init_cpu_khz();
- cpu_data[0].loops_per_jiffy =
- cpufreq_scale(cpu_data[0].loops_per_jiffy,
- cpu_khz_old,
- cpu_khz);
- return 0;
- } else
- return -ENODEV;
-#else
- return -ENODEV;
-#endif
-}
-EXPORT_SYMBOL(recalibrate_cpu_khz);
-static void mark_offset_tsc(void)
+static int init_tsc(char* override)
{
- unsigned long lost,delay;
- unsigned long delta = last_tsc_low;
- int count;
- int countmp;
- static int count1 = 0;
- unsigned long long this_offset, last_offset;
- static int lost_count = 0;
-
- write_seqlock(&monotonic_lock);
- last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- /*
- * It is important that these two operations happen almost at
- * the same time. We do the RDTSC stuff first, since it's
- * faster. To avoid any inconsistencies, we need interrupts
- * disabled locally.
- */
-
- /*
- * Interrupts are just disabled locally since the timer irq
- * has the SA_INTERRUPT flag set. -arca
- */
-
- /* read Pentium cycle counter */
-
- rdtsc(last_tsc_low, last_tsc_high);
-
- spin_lock(&i8253_lock);
- outb_p(0x00, PIT_MODE); /* latch the count ASAP */
-
- count = inb_p(PIT_CH0); /* read the latched count */
- count |= inb(PIT_CH0) << 8;
-
- /*
- * VIA686a test code... reset the latch if count > max + 1
- * from timer_pit.c - cjb
- */
- if (count > LATCH) {
- outb_p(0x34, PIT_MODE);
- outb_p(LATCH & 0xff, PIT_CH0);
- outb(LATCH >> 8, PIT_CH0);
- count = LATCH - 1;
- }
-
- spin_unlock(&i8253_lock);
+ u64 __cpu_khz;
- if (pit_latch_buggy) {
- /* get center value of last 3 time lutch */
- if ((count2 >= count && count >= count1)
- || (count1 >= count && count >= count2)) {
- count2 = count1; count1 = count;
- } else if ((count1 >= count2 && count2 >= count)
- || (count >= count2 && count2 >= count1)) {
- countmp = count;count = count2;
- count2 = count1;count1 = countmp;
- } else {
- count2 = count1; count1 = count; count = count1;
- }
- }
+ __cpu_khz = HYPERVISOR_shared_info->cpu_freq;
+ do_div(__cpu_khz, 1000);
+ cpu_khz = (u32)__cpu_khz;
+ printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n",
+ cpu_khz / 1000, cpu_khz % 1000);
- /* lost tick compensation */
- delta = last_tsc_low - delta;
+ /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz =
+ (2^32 * 1 / (clocks/us)) */
{
- register unsigned long eax, edx;
- eax = delta;
- __asm__("mull %2"
- :"=a" (eax), "=d" (edx)
- :"rm" (fast_gettimeoffset_quotient),
- "0" (eax));
- delta = edx;
- }
- delta += delay_at_last_interrupt;
- lost = delta/(1000000/HZ);
- delay = delta%(1000000/HZ);
- if (lost >= 2) {
- jiffies_64 += lost-1;
-
- /* sanity check to ensure we're not always losing ticks */
- if (lost_count++ > 100) {
- printk(KERN_WARNING "Losing too many ticks!\n");
- printk(KERN_WARNING "TSC cannot be used as a
timesource. \n");
- printk(KERN_WARNING "Possible reasons for this are:\n");
- printk(KERN_WARNING " You're running with
Speedstep,\n");
- printk(KERN_WARNING " You don't have DMA enabled for
your hard disk (see hdparm),\n");
- printk(KERN_WARNING " Incorrect TSC synchronization on
an SMP system (see dmesg).\n");
- printk(KERN_WARNING "Falling back to a sane timesource
now.\n");
-
- clock_fallback();
- }
- /* ... but give the TSC a fair chance */
- if (lost_count > 25)
- cpufreq_delayed_get();
- } else
- lost_count = 0;
- /* update the monotonic base value */
- this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- monotonic_base += cycles_2_ns(this_offset - last_offset);
- write_sequnlock(&monotonic_lock);
-
- /* calculate delay_at_last_interrupt */
- count = ((LATCH-1) - count) * TICK_SIZE;
- delay_at_last_interrupt = (count + LATCH/2) / LATCH;
-
- /* catch corner case where tick rollover occured
- * between tsc and pit reads (as noted when
- * usec delta is > 90% # of usecs/tick)
- */
- if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
- jiffies_64++;
-}
-
-static int __init init_tsc(char* override)
-{
-
- /* check clock override */
- if (override[0] && strncmp(override,"tsc",3)) {
-#ifdef CONFIG_HPET_TIMER
- if (is_hpet_enabled()) {
- printk(KERN_ERR "Warning: clock= override failed.
Defaulting to tsc\n");
- } else
-#endif
- {
- return -ENODEV;
- }
+ unsigned long eax=0, edx=1000;
+ __asm__("divl %2"
+ :"=a" (fast_gettimeoffset_quotient), "=d" (edx)
+ :"r" (cpu_khz),
+ "0" (eax), "1" (edx));
}
- /*
- * If we have APM enabled or the CPU clock speed is variable
- * (CPU stops clock on HLT or slows clock to save power)
- * then the TSC timestamps may diverge by up to 1 jiffy from
- * 'real time' but nothing will break.
- * The most frequent case is that the CPU is "woken" from a halt
- * state by the timer interrupt itself, so we get 0 error. In the
- * rare cases where a driver would "wake" the CPU and request a
- * timestamp, the maximum error is < 1 jiffy. But timestamps are
- * still perfectly ordered.
- * Note that the TSC counter will be reset if APM suspends
- * to disk; this won't break the kernel, though, 'cuz we're
- * smart. See arch/i386/kernel/apm.c.
- */
- /*
- * Firstly we have to do a CPU check for chips with
- * a potentially buggy TSC. At this point we haven't run
- * the ident/bugs checks so we must run this hook as it
- * may turn off the TSC flag.
- *
- * NOTE: this doesn't yet handle SMP 486 machines where only
- * some CPU's have a TSC. Thats never worked and nobody has
- * moaned if you have the only one in the world - you fix it!
- */
-
- count2 = LATCH; /* initialize counter for mark_offset_tsc() */
+ set_cyc2ns_scale(cpu_khz/1000);
- if (cpu_has_tsc) {
- unsigned long tsc_quotient;
-#ifdef CONFIG_HPET_TIMER
- if (is_hpet_enabled() && hpet_use_timer) {
- unsigned long result, remain;
- printk("Using TSC for gettimeofday\n");
- tsc_quotient = calibrate_tsc_hpet(NULL);
- timer_tsc.mark_offset = &mark_offset_tsc_hpet;
- /*
- * Math to calculate hpet to usec multiplier
- * Look for the comments at get_offset_tsc_hpet()
- */
- ASM_DIV64_REG(result, remain, hpet_tick,
- 0, KERNEL_TICK_USEC);
- if (remain > (hpet_tick >> 1))
- result++; /* rounding the result */
+ use_tsc = 1;
- hpet_usec_quotient = result;
- } else
-#endif
- {
- tsc_quotient = calibrate_tsc();
- }
-
- if (tsc_quotient) {
- fast_gettimeoffset_quotient = tsc_quotient;
- use_tsc = 1;
- /*
- * We could be more selective here I suspect
- * and just enable this for the next intel chips ?
- */
- /* report CPU clock rate in Hz.
- * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
- * clock/second. Our precision is about 100 ppm.
- */
- { unsigned long eax=0, edx=1000;
- __asm__("divl %2"
- :"=a" (cpu_khz), "=d" (edx)
- :"r" (tsc_quotient),
- "0" (eax), "1" (edx));
- printk("Detected %lu.%03lu MHz processor.\n",
cpu_khz / 1000, cpu_khz % 1000);
- }
- set_cyc2ns_scale(cpu_khz/1000);
- return 0;
- }
- }
- return -ENODEV;
+ return 0;
}
-#ifndef CONFIG_X86_TSC
-/* disable flag for tsc. Takes effect by clearing the TSC cpu flag
- * in cpu/common.c */
static int __init tsc_setup(char *str)
{
- tsc_disable = 1;
+ printk(KERN_WARNING "notsc: cannot disable TSC in Xen/Linux.\n");
return 1;
}
-#else
-static int __init tsc_setup(char *str)
-{
- printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
- "cannot disable TSC.\n");
- return 1;
-}
-#endif
__setup("notsc", tsc_setup);
@@ -566,7 +365,7 @@ __setup("notsc", tsc_setup);
/************************************************************/
/* tsc timer_opts struct */
-static struct timer_opts timer_tsc = {
+struct timer_opts timer_tsc = {
.name = "tsc",
.mark_offset = mark_offset_tsc,
.get_offset = get_offset_tsc,
@@ -574,7 +373,7 @@ static struct timer_opts timer_tsc = {
.delay = delay_tsc,
};
-struct init_timer_opts __initdata timer_tsc_init = {
+struct init_timer_opts timer_tsc_init = {
.init = init_tsc,
.opts = &timer_tsc,
};
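
The divl above computes fast_gettimeoffset_quotient = 2^32 / (TSC clocks
per usec), so a later (tsc_delta * quotient) >> 32 yields microseconds.
A standalone sketch, not part of the patch; the 1.024GHz clock is
invented so the fixed-point math comes out exact:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t cpu_khz = 1024000;	/* hypothetical 1.024GHz CPU */

	/* Same division the patch performs with "divl":
	 * (10^3 * 2^32) / cpu_khz = 2^32 / (clocks per usec). */
	uint32_t quotient = (uint32_t)((1000ULL << 32) / cpu_khz);

	uint32_t tsc_delta = 10240;	/* 10240 cycles = 10 usec here */
	uint32_t usec = (uint32_t)(((uint64_t)tsc_delta * quotient) >> 32);

	printf("quotient=%u usec=%u\n", (unsigned)quotient, (unsigned)usec);
	/* prints: quotient=4194304 usec=10 */
	return 0;
}
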
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/traps.c
linux-2.6-xen-sparse/arch/i386/kernel/traps.c
--- pristine-linux-2.6.12/arch/i386/kernel/traps.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/traps.c 2005-07-28
13:17:07.000000000 -0700
@@ -58,9 +58,6 @@
asmlinkage int system_call(void);
-struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
- { 0, 0 }, { 0, 0 } };
-
/* Do we ignore FPU interrupts ? */
char ignore_fpu_irq = 0;
@@ -88,7 +85,7 @@ asmlinkage void page_fault(void);
asmlinkage void coprocessor_error(void);
asmlinkage void simd_coprocessor_error(void);
asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
+asmlinkage void fixup_4gb_segment(void);
asmlinkage void machine_check(void);
static int kstack_depth_to_print = 24;
@@ -209,7 +206,7 @@ void show_registers(struct pt_regs *regs
esp = (unsigned long) (&regs->esp);
ss = __KERNEL_DS;
- if (regs->xcs & 3) {
+ if (regs->xcs & 2) {
in_kernel = 0;
esp = regs->esp;
ss = regs->xss & 0xffff;
@@ -265,7 +262,7 @@ static void handle_BUG(struct pt_regs *r
char c;
unsigned long eip;
- if (regs->xcs & 3)
+ if (regs->xcs & 2)
goto no_bug; /* Not in kernel */
eip = regs->eip;
@@ -353,7 +350,7 @@ void die(const char * str, struct pt_reg
static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
{
- if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs))
+ if (!(regs->eflags & VM_MASK) && !(2 & regs->xcs))
die(str, regs, err);
}
@@ -366,7 +363,7 @@ static void do_trap(int trapnr, int sign
goto trap_signal;
}
- if (!(regs->xcs & 3))
+ if (!(regs->xcs & 2))
goto kernel_trap;
trap_signal: {
@@ -446,49 +443,37 @@ DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
+DO_VM86_ERROR( 7, SIGSEGV, "device not available", device_not_available)
DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
+#ifdef CONFIG_X86_MCE
+DO_ERROR(18, SIGBUS, "machine check", machine_check)
+#endif
fastcall void do_general_protection(struct pt_regs * regs, long error_code)
{
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
- struct thread_struct *thread = &current->thread;
-
/*
- * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
- * invalid offset set (the LAZY one) and the faulting thread has
- * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
- * and we set the offset field correctly. Then we let the CPU to
- * restart the faulting instruction.
- */
- if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
- thread->io_bitmap_ptr) {
- memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
- thread->io_bitmap_max);
- /*
- * If the previously set map was extending to higher ports
- * than the current one, pad extra space with 0xff (no access).
- */
- if (thread->io_bitmap_max < tss->io_bitmap_max)
- memset((char *) tss->io_bitmap +
- thread->io_bitmap_max, 0xff,
- tss->io_bitmap_max - thread->io_bitmap_max);
- tss->io_bitmap_max = thread->io_bitmap_max;
- tss->io_bitmap_base = IO_BITMAP_OFFSET;
- put_cpu();
- return;
+ * If we trapped on an LDT access then ensure that the default_ldt is
+ * loaded, if nothing else. We load default_ldt lazily because LDT
+ * switching costs time and many applications don't need it.
+ */
+ if (unlikely((error_code & 6) == 4)) {
+ unsigned long ldt;
+ __asm__ __volatile__ ("sldt %0" : "=r" (ldt));
+ if (ldt == 0) {
+ xen_set_ldt((unsigned long)&default_ldt[0], 5);
+ return;
+ }
}
- put_cpu();
if (regs->eflags & VM_MASK)
goto gp_in_vm86;
- if (!(regs->xcs & 3))
+ if (!(regs->xcs & 2))
goto gp_in_kernel;
current->thread.error_code = error_code;
@@ -624,6 +609,14 @@ fastcall void do_nmi(struct pt_regs * re
nmi_enter();
cpu = smp_processor_id();
+
+#ifdef CONFIG_HOTPLUG_CPU
+ if (!cpu_online(cpu)) {
+ nmi_exit();
+ return;
+ }
+#endif
+
++nmi_count(cpu);
if (!nmi_callback(regs, cpu))
@@ -682,14 +675,16 @@ fastcall void do_debug(struct pt_regs *
unsigned int condition;
struct task_struct *tsk = current;
- __asm__ __volatile__("movl %%db6,%0" : "=r" (condition));
+ condition = HYPERVISOR_get_debugreg(6);
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
SIGTRAP) == NOTIFY_STOP)
return;
+#if 0
/* It's safe to allow irq's after DR6 has been saved */
if (regs->eflags & X86_EFLAGS_IF)
local_irq_enable();
+#endif
/* Mask out spurious debug traps due to lazy DR7 setting */
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -713,7 +708,7 @@ fastcall void do_debug(struct pt_regs *
* check for kernel mode by just checking the CPL
* of CS.
*/
- if ((regs->xcs & 3) == 0)
+ if ((regs->xcs & 2) == 0)
goto clear_TF_reenable;
}
@@ -724,9 +719,7 @@ fastcall void do_debug(struct pt_regs *
* the signal is delivered.
*/
clear_dr7:
- __asm__("movl %0,%%db7"
- : /* no output */
- : "r" (0));
+ HYPERVISOR_set_debugreg(7, 0);
return;
debug_vm86:
@@ -878,15 +871,6 @@ fastcall void do_simd_coprocessor_error(
}
}
-fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
- long error_code)
-{
-#if 0
- /* No need to warn about this any longer. */
- printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
-}
-
fastcall void setup_x86_bogus_stack(unsigned char * stk)
{
unsigned long *switch16_ptr, *switch32_ptr;
@@ -947,7 +931,7 @@ asmlinkage void math_state_restore(struc
struct thread_info *thread = current_thread_info();
struct task_struct *tsk = thread->task;
- clts(); /* Allow maths ops (or we recurse) */
+ /* NB. 'clts' is done for us by Xen during virtual trap. */
if (!tsk_used_math(tsk))
init_fpu(tsk);
restore_fpu(tsk);
@@ -980,100 +964,58 @@ void __init trap_init_f00f_bug(void)
}
#endif
-#define _set_gate(gate_addr,type,dpl,addr,seg) \
-do { \
- int __d0, __d1; \
- __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
- "movw %4,%%dx\n\t" \
- "movl %%eax,%0\n\t" \
- "movl %%edx,%1" \
- :"=m" (*((long *) (gate_addr))), \
- "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
- :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
- "3" ((char *) (addr)),"2" ((seg) << 16)); \
-} while (0)
-
-
-/*
- * This needs to use 'idt_table' rather than 'idt', and
- * thus use the _nonmapped_ version of the IDT, as the
- * Pentium F0 0F bugfix can have resulted in the mapped
- * IDT being write-protected.
- */
-void set_intr_gate(unsigned int n, void *addr)
-{
- _set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
-}
-
-/*
- * This routine sets up an interrupt gate at directory privilege level 3.
- */
-static inline void set_system_intr_gate(unsigned int n, void *addr)
-{
- _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
-}
-
-static void __init set_trap_gate(unsigned int n, void *addr)
-{
- _set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
-}
-
-static void __init set_system_gate(unsigned int n, void *addr)
-{
- _set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
-}
-
-static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
-{
- _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
-}
+/* NB. All these are "trap gates" (i.e. events_mask isn't cleared). */
+static trap_info_t trap_table[] = {
+ { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
+ { 1, 0, __KERNEL_CS, (unsigned long)debug },
+ { 3, 3, __KERNEL_CS, (unsigned long)int3 },
+ { 4, 3, __KERNEL_CS, (unsigned long)overflow },
+ { 5, 3, __KERNEL_CS, (unsigned long)bounds },
+ { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
+ { 7, 0, __KERNEL_CS, (unsigned long)device_not_available },
+ { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
+ { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
+ { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
+ { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
+ { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
+ { 14, 0, __KERNEL_CS, (unsigned long)page_fault },
+ { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment },
+ { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
+ { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
+#ifdef CONFIG_X86_MCE
+ { 18, 0, __KERNEL_CS, (unsigned long)machine_check },
+#endif
+ { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
+ { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call },
+ { 0, 0, 0, 0 }
+};
void __init trap_init(void)
{
-#ifdef CONFIG_EISA
- void __iomem *p = ioremap(0x0FFFD9, 4);
- if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
- EISA_bus = 1;
- }
- iounmap(p);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- init_apic_mappings();
-#endif
-
- set_trap_gate(0,&divide_error);
- set_intr_gate(1,&debug);
- set_intr_gate(2,&nmi);
- set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
- set_system_gate(4,&overflow);
- set_system_gate(5,&bounds);
- set_trap_gate(6,&invalid_op);
- set_trap_gate(7,&device_not_available);
- set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
- set_trap_gate(9,&coprocessor_segment_overrun);
- set_trap_gate(10,&invalid_TSS);
- set_trap_gate(11,&segment_not_present);
- set_trap_gate(12,&stack_segment);
- set_trap_gate(13,&general_protection);
- set_intr_gate(14,&page_fault);
- set_trap_gate(15,&spurious_interrupt_bug);
- set_trap_gate(16,&coprocessor_error);
- set_trap_gate(17,&alignment_check);
-#ifdef CONFIG_X86_MCE
- set_trap_gate(18,&machine_check);
-#endif
- set_trap_gate(19,&simd_coprocessor_error);
+ HYPERVISOR_set_trap_table(trap_table);
- set_system_gate(SYSCALL_VECTOR,&system_call);
+ /*
+ * default LDT is a single-entry callgate to lcall7 for iBCS
+ * and a callgate to lcall27 for Solaris/x86 binaries
+ */
+ make_lowmem_page_readonly(&default_ldt[0]);
/*
* Should be a barrier for any external CPU state.
*/
cpu_init();
+}
- trap_init_hook();
+void smp_trap_init(trap_info_t *trap_ctxt)
+{
+ trap_info_t *t = trap_table;
+
+ for (t = trap_table; t->address; t++) {
+ trap_ctxt[t->vector].flags = t->flags;
+ trap_ctxt[t->vector].cs = t->cs;
+ trap_ctxt[t->vector].address = t->address;
+ }
}
static int __init kstack_setup(char *s)
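
The repeated (regs->xcs & 3) to (regs->xcs & 2) changes in this file
deserve a note: a Xen guest kernel runs in ring 1, so a nonzero CPL no
longer implies user space. Testing bit 1 of the selector groups rings
0-1 as kernel and rings 2-3 as user. A sketch (hypothetical helper name,
not part of the patch):

#include <assert.h>

/* RPL is the low two bits of a segment selector. */
static int xen_user_mode(unsigned cs)
{
	return (cs & 2) != 0;	/* rings 2 and 3 => user */
}

int main(void)
{
	assert(!xen_user_mode(0));	/* ring 0: kernel on bare metal */
	assert(!xen_user_mode(1));	/* ring 1: Xen guest kernel */
	assert(xen_user_mode(3));	/* ring 3: user space */
	return 0;
}
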
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/vsyscall.S
linux-2.6-xen-sparse/arch/i386/kernel/vsyscall.S
--- pristine-linux-2.6.12/arch/i386/kernel/vsyscall.S 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/vsyscall.S 2005-07-28
13:17:07.000000000 -0700
@@ -4,12 +4,12 @@ __INITDATA
.globl vsyscall_int80_start, vsyscall_int80_end
vsyscall_int80_start:
- .incbin "arch/i386/kernel/vsyscall-int80.so"
+ .incbin "arch/xen/i386/kernel/vsyscall-int80.so"
vsyscall_int80_end:
.globl vsyscall_sysenter_start, vsyscall_sysenter_end
vsyscall_sysenter_start:
- .incbin "arch/i386/kernel/vsyscall-sysenter.so"
+ .incbin "arch/xen/i386/kernel/vsyscall-sysenter.so"
vsyscall_sysenter_end:
__FINIT
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/mach-default/Makefile
linux-2.6-xen-sparse/arch/i386/mach-default/Makefile
--- pristine-linux-2.6.12/arch/i386/mach-default/Makefile 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mach-default/Makefile 2005-07-28
13:17:07.000000000 -0700
@@ -2,4 +2,11 @@
# Makefile for the linux kernel.
#
-obj-y := setup.o topology.o
+c-obj-y := topology.o
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y)):
+ @ln -fsn $(srctree)/arch/i386/mach-default/$(notdir $@) $@
+
+obj-y += $(c-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/Makefile linux-2.6-xen-sparse/arch/i386/Makefile
--- pristine-linux-2.6.12/arch/i386/Makefile 2005-06-17 12:48:29.000000000
-0700
+++ linux-2.6-xen-sparse/arch/i386/Makefile 2005-07-28 13:17:07.000000000
-0700
@@ -17,15 +17,19 @@
# 20050320 Kianusch Sayah Karadji <kianusch@xxxxxxxxxxx>
# Added support for GEODE CPU
+XENARCH := $(subst ",,$(CONFIG_XENARCH))
+
LDFLAGS := -m elf_i386
-OBJCOPYFLAGS := -O binary -R .note -R .comment -S
LDFLAGS_vmlinux :=
-CHECKFLAGS += -D__i386__
+CHECK := $(CHECK) -D__i386__=1
+
+CFLAGS += -m32
+AFLAGS += -m32
CFLAGS += -pipe -msoft-float
# prevent gcc from keeping the stack 16 byte aligned
-CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
+CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2,)
align := $(cc-option-align)
cflags-$(CONFIG_M386) += -march=i386
@@ -59,116 +63,46 @@ cflags-$(CONFIG_MGEODEGX1) += $(call cc
# -mregparm=3 works ok on gcc-3.0 and later
#
-GCC_VERSION := $(call cc-version)
+GCC_VERSION := $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
cflags-$(CONFIG_REGPARM) += $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;)
# Disable unit-at-a-time mode, it makes gcc use a lot more stack
# due to the lack of sharing of stacklots.
-CFLAGS += $(call cc-option,-fno-unit-at-a-time)
+CFLAGS += $(call cc-option,-fno-unit-at-a-time,)
CFLAGS += $(cflags-y)
-# Default subarch .c files
-mcore-y := mach-default
-
-# Voyager subarch support
-mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-i386/mach-voyager
-mcore-$(CONFIG_X86_VOYAGER) := mach-voyager
-
-# VISWS subarch support
-mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-i386/mach-visws
-mcore-$(CONFIG_X86_VISWS) := mach-visws
-
-# NUMAQ subarch support
-mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-i386/mach-numaq
-mcore-$(CONFIG_X86_NUMAQ) := mach-default
-
-# BIGSMP subarch support
-mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-i386/mach-bigsmp
-mcore-$(CONFIG_X86_BIGSMP) := mach-default
-
-#Summit subarch support
-mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit
-mcore-$(CONFIG_X86_SUMMIT) := mach-default
-
-# generic subarchitecture
-mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
-mcore-$(CONFIG_X86_GENERICARCH) := mach-default
-core-$(CONFIG_X86_GENERICARCH) += arch/i386/mach-generic/
-
-# ES7000 subarch support
-mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000
-mcore-$(CONFIG_X86_ES7000) := mach-default
-core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/
-
-# default subarch .h files
-mflags-y += -Iinclude/asm-i386/mach-default
-
-head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
+head-y := arch/xen/i386/kernel/head.o arch/xen/i386/kernel/init_task.o
libs-y += arch/i386/lib/
-core-y += arch/i386/kernel/ \
- arch/i386/mm/ \
- arch/i386/$(mcore-y)/ \
+core-y += arch/xen/i386/kernel/ \
+ arch/xen/i386/mm/ \
+ arch/xen/i386/mach-default/ \
arch/i386/crypto/
+# \
+# arch/xen/$(mcore-y)/
drivers-$(CONFIG_MATH_EMULATION) += arch/i386/math-emu/
-drivers-$(CONFIG_PCI) += arch/i386/pci/
+drivers-$(CONFIG_PCI) += arch/xen/i386/pci/
# must be linked after kernel/
drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/
drivers-$(CONFIG_PM) += arch/i386/power/
-CFLAGS += $(mflags-y)
-AFLAGS += $(mflags-y)
-
-boot := arch/i386/boot
-
-.PHONY: zImage bzImage compressed zlilo bzlilo \
- zdisk bzdisk fdimage fdimage144 fdimage288 install kernel_install
-
-all: bzImage
-
-# KBUILD_IMAGE specify target image being built
- KBUILD_IMAGE := $(boot)/bzImage
-zImage zlilo zdisk: KBUILD_IMAGE := arch/i386/boot/zImage
+# for clean
+obj- += kernel/ mm/ pci/
+#obj- += ../../i386/lib/ ../../i386/mm/
+#../../i386/$(mcore-y)/
+#obj- += ../../i386/pci/ ../../i386/oprofile/ ../../i386/power/
+
+xenflags-y += -Iinclude/asm-xen/asm-i386/mach-xen \
+ -Iinclude/asm-i386/mach-default
+CFLAGS += $(xenflags-y)
+AFLAGS += $(xenflags-y)
-zImage bzImage: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
+prepare: include/asm-$(XENARCH)/asm_offsets.h
+CLEAN_FILES += include/asm-$(XENARCH)/asm_offsets.h
-compressed: zImage
+arch/$(XENARCH)/kernel/asm-offsets.s: include/asm include/.asm-ignore \
+ include/linux/version.h include/config/MARKER
-zlilo bzlilo: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
-
-zdisk bzdisk: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
-
-fdimage fdimage144 fdimage288: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
-
-install: vmlinux
-install kernel_install:
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
-
-prepare: include/asm-$(ARCH)/asm_offsets.h
-CLEAN_FILES += include/asm-$(ARCH)/asm_offsets.h
-
-arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \
- include/config/MARKER
-
-include/asm-$(ARCH)/asm_offsets.h: arch/$(ARCH)/kernel/asm-offsets.s
+include/asm-$(XENARCH)/asm_offsets.h: arch/$(XENARCH)/kernel/asm-offsets.s
$(call filechk,gen-asm-offsets)
-
-archclean:
- $(Q)$(MAKE) $(clean)=arch/i386/boot
-
-define archhelp
- echo '* bzImage - Compressed kernel image (arch/$(ARCH)/boot/bzImage)'
- echo ' install - Install kernel using'
- echo ' (your) ~/bin/installkernel or'
- echo ' (distribution) /sbin/installkernel or'
- echo ' install to $$(INSTALL_PATH) and run lilo'
- echo ' bzdisk - Create a boot floppy in /dev/fd0'
- echo ' fdimage - Create a boot floppy image'
-endef
-
-CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/mm/fault.c
linux-2.6-xen-sparse/arch/i386/mm/fault.c
--- pristine-linux-2.6.12/arch/i386/mm/fault.c 2005-06-17 12:48:29.000000000
-0700
+++ linux-2.6-xen-sparse/arch/i386/mm/fault.c 2005-07-28 13:17:07.000000000
-0700
@@ -21,6 +21,7 @@
#include <linux/vt_kern.h> /* For unblank_screen() */
#include <linux/highmem.h>
#include <linux/module.h>
+#include <linux/percpu.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -29,6 +30,8 @@
extern void die(const char *,struct pt_regs *,long);
+DEFINE_PER_CPU(pgd_t *, cur_pgd);
+
/*
* Unlock any spinlocks which will prevent us from getting the
* message out
@@ -77,7 +80,7 @@ static inline unsigned long get_segment_
u32 seg_ar, seg_limit, base, *desc;
/* The standard kernel/user address space limit. */
- *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg;
+ *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
/* Unlikely, but must come before segment checks. */
if (unlikely((regs->eflags & VM_MASK) != 0))
@@ -107,7 +110,7 @@ static inline unsigned long get_segment_
desc = (void *)desc + (seg & ~7);
} else {
/* Must disable preemption while reading the GDT. */
- desc = (u32 *)&per_cpu(cpu_gdt_table, get_cpu());
+ desc = (u32 *)get_cpu_gdt_table(get_cpu());
desc = (void *)desc + (seg & ~7);
}
@@ -211,25 +214,30 @@ fastcall void do_invalid_op(struct pt_re
* bit 1 == 0 means read, 1 means write
* bit 2 == 0 means kernel, 1 means user-mode
*/
-fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code)
+fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct * vma;
- unsigned long address;
unsigned long page;
int write;
siginfo_t info;
- /* get the address */
- __asm__("movl %%cr2,%0":"=r" (address));
+ /* Set the "privileged fault" bit to something sane. */
+ error_code &= 3;
+ error_code |= (regs->xcs & 2) << 1;
+ if (regs->eflags & X86_EFLAGS_VM)
+ error_code |= 4;
if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
SIGSEGV) == NOTIFY_STOP)
return;
+#if 0
/* It's safe to allow irq's after cr2 has been saved */
if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
local_irq_enable();
+#endif
tsk = current;
@@ -446,9 +454,10 @@ no_context:
printk(" at virtual address %08lx\n",address);
printk(KERN_ALERT " printing eip:\n");
printk("%08lx\n", regs->eip);
- asm("movl %%cr3,%0":"=r" (page));
- page = ((unsigned long *) __va(page))[address >> 22];
- printk(KERN_ALERT "*pde = %08lx\n", page);
+ page = ((unsigned long *) per_cpu(cur_pgd, smp_processor_id()))[address >> 22];
+ printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
+ machine_to_phys(page));
/*
* We must not directly access the pte in the highpte
* case, the page table might be allocated in highmem.
@@ -459,8 +468,10 @@ no_context:
if (page & 1) {
page &= PAGE_MASK;
address &= 0x003ff000;
+ page = machine_to_phys(page);
page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
- printk(KERN_ALERT "*pte = %08lx\n", page);
+ printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
+ machine_to_phys(page));
}
#endif
die("Oops", regs, error_code);
@@ -514,14 +525,12 @@ vmalloc_fault:
* an interrupt in the middle of a task switch..
*/
int index = pgd_index(address);
- unsigned long pgd_paddr;
pgd_t *pgd, *pgd_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
pte_t *pte_k;
- asm("movl %%cr3,%0":"=r" (pgd_paddr));
- pgd = index + (pgd_t *)__va(pgd_paddr);
+ pgd = index + per_cpu(cur_pgd, smp_processor_id());
pgd_k = init_mm.pgd + index;
if (!pgd_present(*pgd_k))
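
On the error_code rewrite at the top of do_page_fault() above: Xen passes
the faulting address as an argument (no cr2 read), and the handler
resynthesizes the i386 error-code bits from saved state -- bit 0
present/protection, bit 1 write, bit 2 user mode. The same logic as a
self-contained function (hypothetical name, not part of the patch):

static unsigned long xen_fault_error_code(unsigned long error_code,
					  unsigned long xcs,
					  int vm86_mode)
{
	error_code &= 3;		/* keep present/write bits */
	error_code |= (xcs & 2) << 1;	/* ring 2/3 CS => user-mode bit */
	if (vm86_mode)
		error_code |= 4;	/* vm86 counts as user mode too */
	return error_code;
}
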
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/mm/highmem.c
linux-2.6-xen-sparse/arch/i386/mm/highmem.c
--- pristine-linux-2.6.12/arch/i386/mm/highmem.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mm/highmem.c 2005-07-28 13:17:07.000000000
-0700
@@ -25,7 +25,7 @@ void kunmap(struct page *page)
* However when holding an atomic kmap it is not legal to sleep, so atomic
* kmaps are appropriate for short, tight code paths only.
*/
-void *kmap_atomic(struct page *page, enum km_type type)
+static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
@@ -41,12 +41,23 @@ void *kmap_atomic(struct page *page, enu
if (!pte_none(*(kmap_pte-idx)))
BUG();
#endif
- set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
+ set_pte(kmap_pte-idx, mk_pte(page, prot));
__flush_tlb_one(vaddr);
return (void*) vaddr;
}
+void *kmap_atomic(struct page *page, enum km_type type)
+{
+ return __kmap_atomic(page, type, kmap_prot);
+}
+
+/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
+void *kmap_atomic_pte(struct page *page, enum km_type type)
+{
+ return __kmap_atomic(page, type, PAGE_KERNEL_RO);
+}
+
void kunmap_atomic(void *kvaddr, enum km_type type)
{
#ifdef CONFIG_DEBUG_HIGHMEM
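
kmap_atomic_pte() exists because Xen requires live page tables to be
mapped read-only; a caller inspecting a highmem page-table page therefore
maps it with PAGE_KERNEL_RO instead of kmap_prot. A kernel-context sketch
of such a caller (hypothetical function, not part of the patch):

/* Read one entry from a highmem page-table page without writing it. */
static unsigned long read_pte_entry(struct page *pte_page, int idx)
{
	unsigned long *ptep, val;

	ptep = (unsigned long *)kmap_atomic_pte(pte_page, KM_PTE0);
	val = ptep[idx];	/* read-only access is always safe */
	kunmap_atomic(ptep, KM_PTE0);
	return val;
}
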
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/mm/hypervisor.c
linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c
--- pristine-linux-2.6.12/arch/i386/mm/hypervisor.c 1969-12-31
16:00:00.000000000 -0800
+++ linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c 2005-07-28
13:17:07.000000000 -0700
@@ -0,0 +1,363 @@
+/******************************************************************************
+ * mm/hypervisor.c
+ *
+ * Update page tables via the hypervisor.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/balloon.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#define pte_offset_kernel pte_offset
+#define pud_t pgd_t
+#define pud_offset(d, va) d
+#elif defined(CONFIG_X86_64)
+#define pmd_val_ma(v) (v).pmd
+#else
+#ifdef CONFIG_X86_PAE
+# define pmd_val_ma(v) ((v).pmd)
+# define pud_val_ma(v) ((v).pgd.pgd)
+#else
+# define pmd_val_ma(v) ((v).pud.pgd.pgd)
+#endif
+#endif
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+void xen_l1_entry_update(pte_t *ptr, pte_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = pte_val_ma(val);
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = pmd_val_ma(val);
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+#ifdef CONFIG_X86_PAE
+void xen_l3_entry_update(pud_t *ptr, pud_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = pud_val_ma(val);
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void xen_l3_entry_update(pud_t *ptr, pud_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = val.pud;
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = val.pgd;
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_XEN_SHADOW_MODE */
+
+void xen_machphys_update(unsigned long mfn, unsigned long pfn)
+{
+ mmu_update_t u;
+ u.ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+ u.val = pfn;
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pt_switch(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_NEW_BASEPTR;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_new_user_pt(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_NEW_USER_BASEPTR;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_tlb_flush(void)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_invlpg(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_INVLPG_LOCAL;
+ op.linear_addr = ptr & PAGE_MASK;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#ifdef CONFIG_SMP
+
+void xen_tlb_flush_all(void)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_TLB_FLUSH_ALL;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_tlb_flush_mask(cpumask_t *mask)
+{
+ struct mmuext_op op;
+ if ( cpus_empty(*mask) )
+ return;
+ op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+ op.vcpumask = mask->bits;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_invlpg_all(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_INVLPG_ALL;
+ op.linear_addr = ptr & PAGE_MASK;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
+{
+ struct mmuext_op op;
+ if ( cpus_empty(*mask) )
+ return;
+ op.cmd = MMUEXT_INVLPG_MULTI;
+ op.vcpumask = mask->bits;
+ op.linear_addr = ptr & PAGE_MASK;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#endif /* CONFIG_SMP */
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+void xen_pgd_pin(unsigned long ptr)
+{
+ struct mmuext_op op;
+#ifdef CONFIG_X86_64
+ op.cmd = MMUEXT_PIN_L4_TABLE;
+#elif defined(CONFIG_X86_PAE)
+ op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+ op.cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pgd_unpin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_UNPIN_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pte_pin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_PIN_L1_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pte_unpin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_UNPIN_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#ifdef CONFIG_X86_64
+void xen_pud_pin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_PIN_L3_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pud_unpin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_UNPIN_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pmd_pin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_PIN_L2_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pmd_unpin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_UNPIN_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_XEN_SHADOW_MODE */
+
+void xen_set_ldt(unsigned long ptr, unsigned long len)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_SET_LDT;
+ op.linear_addr = ptr;
+ op.nr_ents = len;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_contig_memory(unsigned long vstart, unsigned int order)
+{
+ /*
+ * Ensure multi-page extents are contiguous in machine memory. This code
+ * could be cleaned up some, and the number of hypercalls reduced.
+ */
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long mfn, i, flags;
+
+ scrub_pages(vstart, 1 << order);
+
+ balloon_lock(flags);
+
+ /* 1. Zap current PTEs, giving away the underlying pages. */
+ for (i = 0; i < (1<<order); i++) {
+ pgd = pgd_offset_k(vstart + (i*PAGE_SIZE));
+ pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE)));
+ pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE)));
+ pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
+ mfn = pte_mfn(*pte);
+ HYPERVISOR_update_va_mapping(
+ vstart + (i*PAGE_SIZE), __pte_ma(0), 0);
+ phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
+ INVALID_P2M_ENTRY;
+ BUG_ON(HYPERVISOR_dom_mem_op(
+ MEMOP_decrease_reservation, &mfn, 1, 0) != 1);
+ }
+
+ /* 2. Get a new contiguous memory extent. */
+ BUG_ON(HYPERVISOR_dom_mem_op(
+ MEMOP_increase_reservation, &mfn, 1, order) != 1);
+
+ /* 3. Map the new extent in place of old pages. */
+ for (i = 0; i < (1<<order); i++) {
+ HYPERVISOR_update_va_mapping(
+ vstart + (i*PAGE_SIZE),
+ __pte_ma(((mfn+i)<<PAGE_SHIFT)|__PAGE_KERNEL), 0);
+ xen_machphys_update(mfn+i, (__pa(vstart)>>PAGE_SHIFT)+i);
+ phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = mfn+i;
+ }
+
+ flush_tlb_all();
+
+ balloon_unlock(flags);
+}
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+
+unsigned long allocate_empty_lowmem_region(unsigned long pages)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long *pfn_array;
+ unsigned long vstart;
+ unsigned long i;
+ unsigned int order = get_order(pages*PAGE_SIZE);
+
+ vstart = __get_free_pages(GFP_KERNEL, order);
+ if ( vstart == 0 )
+ return 0UL;
+
+ scrub_pages(vstart, 1 << order);
+
+ pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
+ if ( pfn_array == NULL )
+ BUG();
+
+ for ( i = 0; i < (1<<order); i++ )
+ {
+ pgd = pgd_offset_k( (vstart + (i*PAGE_SIZE)));
+ pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE)));
+ pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE)));
+ pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
+ pfn_array[i] = pte_mfn(*pte);
+#ifdef CONFIG_X86_64
+ xen_l1_entry_update(pte, __pte(0));
+#else
+ HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0);
+#endif
+ phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
+ INVALID_P2M_ENTRY;
+ }
+
+ flush_tlb_all();
+
+ balloon_put_pages(pfn_array, 1 << order);
+
+ vfree(pfn_array);
+
+ return vstart;
+}
+
+#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
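A note on the wrappers above: each one fills a single struct mmuext_op and
issues one hypercall, so a sequence of pins costs a hypercall apiece.
HYPERVISOR_mmuext_op() already takes an array and a count, so callers that
need several operations could batch them. A minimal sketch under that
assumption (batch_pin_and_flush is a hypothetical helper, not part of this
patch; it reuses only the commands and the hypercall signature shown above):

	/* Hedged sketch: pin a PTE page and flush the TLB in one hypercall
	 * rather than two. Assumes the mmuext_op layout used above. */
	static void batch_pin_and_flush(unsigned long ptr)
	{
		struct mmuext_op op[2];

		op[0].cmd = MMUEXT_PIN_L1_TABLE;
		op[0].mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
		op[1].cmd = MMUEXT_TLB_FLUSH_ALL;

		BUG_ON(HYPERVISOR_mmuext_op(op, 2, NULL, DOMID_SELF) < 0);
	}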
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/init.c linux-2.6-xen-sparse/arch/i386/mm/init.c
--- pristine-linux-2.6.12/arch/i386/mm/init.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mm/init.c 2005-07-28 13:17:07.000000000 -0700
@@ -39,6 +39,7 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm-xen/hypervisor.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@@ -56,9 +57,10 @@ static pmd_t * __init one_md_table_init(
{
pud_t *pud;
pmd_t *pmd_table;
-
+
#ifdef CONFIG_X86_PAE
pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+ make_page_readonly(pmd_table);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
if (pmd_table != pmd_offset(pud, 0))
@@ -79,6 +81,7 @@ static pte_t * __init one_page_table_ini
{
if (pmd_none(*pmd)) {
 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+ make_page_readonly(page_table);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
if (page_table != pte_offset_kernel(pmd, 0))
BUG();
@@ -119,7 +122,7 @@ static void __init page_table_range_init
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
- if (pmd_none(*pmd))
+ if (vaddr < HYPERVISOR_VIRT_START && pmd_none(*pmd))
one_page_table_init(pmd);
vaddr += PMD_SIZE;
@@ -148,16 +151,36 @@ static void __init kernel_physical_mappi
pte_t *pte;
int pgd_idx, pmd_idx, pte_ofs;
+ unsigned long max_ram_pfn = xen_start_info.nr_pages;
+ if (max_ram_pfn > max_low_pfn)
+ max_ram_pfn = max_low_pfn;
+
pgd_idx = pgd_index(PAGE_OFFSET);
pgd = pgd_base + pgd_idx;
pfn = 0;
+ pmd_idx = pmd_index(PAGE_OFFSET);
+ pte_ofs = pte_index(PAGE_OFFSET);
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+#ifdef CONFIG_XEN
+ /*
+ * Native Linux does not have PAE paging enabled yet at this
+ * point. When running as a Xen domain we are already in PAE
+ * mode, so we can't simply hook in an empty pmd; that would
+ * kill the mappings we are currently using ...
+ */
+ pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
+#else
pmd = one_md_table_init(pgd);
+#endif
if (pfn >= max_low_pfn)
continue;
- for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
+ pmd += pmd_idx;
+ for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
+ if (address >= HYPERVISOR_VIRT_START)
+ continue;
/* Map with big pages if possible, otherwise create
normal page tables. */
if (cpu_has_pse) {
@@ -171,14 +194,20 @@ static void __init kernel_physical_mappi
} else {
pte = one_page_table_init(pmd);
- for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
+ pte += pte_ofs;
+ for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
+ /* XEN: Only map initial RAM allocation. */
+ if ((pfn >= max_ram_pfn) || pte_present(*pte))
+ continue;
 if (is_kernel_text(address))
 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
 else
 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
}
+ pte_ofs = 0;
}
}
+ pmd_idx = 0;
}
}
@@ -271,7 +300,8 @@ void __init one_highpage_init(struct pag
ClearPageReserved(page);
set_bit(PG_highmem, &page->flags);
set_page_count(page, 1);
- __free_page(page);
+ if (pfn < xen_start_info.nr_pages)
+ __free_page(page);
totalhigh_pages++;
} else
SetPageReserved(page);
@@ -308,6 +338,7 @@ static void __init pagetable_init (void)
{
unsigned long vaddr;
pgd_t *pgd_base = swapper_pg_dir;
+ pgd_t *old_pgd = (pgd_t *)xen_start_info.pt_base;
#ifdef CONFIG_X86_PAE
int i;
@@ -328,6 +359,45 @@ static void __init pagetable_init (void)
__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
}
+ /*
+ * Switch to the proper init_mm page directory. Initialise it from the
+ * current page directory, write-protect the new page directory, then
+ * switch to it. We clean up by write-enabling and then freeing the old
+ * page dir.
+ */
+#ifndef CONFIG_X86_PAE
+ memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t));
+ make_page_readonly(pgd_base);
+ xen_pgd_pin(__pa(pgd_base));
+ load_cr3(pgd_base);
+ xen_pgd_unpin(__pa(old_pgd));
+ make_page_writable(old_pgd);
+ __flush_tlb_all();
+ free_bootmem(__pa(old_pgd), PAGE_SIZE);
+#else
+ {
+ pud_t *old_pud = pud_offset(old_pgd+3, PAGE_OFFSET);
+ pmd_t *old_pmd = pmd_offset(old_pud, PAGE_OFFSET);
+ pmd_t *new_pmd = alloc_bootmem_low_pages(PAGE_SIZE);
+
+ memcpy(new_pmd, old_pmd, PAGE_SIZE);
+ memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t));
+ set_pgd(&pgd_base[3], __pgd(__pa(new_pmd) | _PAGE_PRESENT));
+
+ make_page_readonly(new_pmd);
+ make_page_readonly(pgd_base);
+ xen_pgd_pin(__pa(pgd_base));
+ load_cr3(pgd_base);
+ xen_pgd_unpin(__pa(old_pgd));
+ make_page_writable(old_pgd);
+ make_page_writable(old_pmd);
+ __flush_tlb_all();
+
+ free_bootmem(__pa(old_pgd), PAGE_SIZE);
+ free_bootmem(__pa(old_pmd), PAGE_SIZE);
+ }
+#endif
+
+ init_mm.context.pinned = 1;
kernel_physical_mapping_init(pgd_base);
remap_numa_kva();
@@ -340,7 +410,7 @@ static void __init pagetable_init (void)
permanent_kmaps_init(pgd_base);
-#ifdef CONFIG_X86_PAE
+#if 0 /* def CONFIG_X86_PAE */
/*
* Add low memory identity-mappings - SMP needs it when
* starting up on an AP from real-mode. In the non-PAE
@@ -348,7 +418,7 @@ static void __init pagetable_init (void)
* All user-space mappings are explicitly cleared after
* SMP startup.
*/
- pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
+ set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]);
#endif
}
@@ -383,7 +453,7 @@ void zap_low_mappings (void)
* us, because pgd_clear() is a no-op on i386.
*/
for (i = 0; i < USER_PTRS_PER_PGD; i++)
-#ifdef CONFIG_X86_PAE
+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
set_pgd(swapper_pg_dir+i, __pgd(0));
@@ -470,6 +540,10 @@ out:
*/
void __init paging_init(void)
{
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+ int i;
+#endif
+
#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
@@ -478,12 +552,12 @@ void __init paging_init(void)
pagetable_init();
- load_cr3(swapper_pg_dir);
-
-#ifdef CONFIG_X86_PAE
+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
/*
* We will bail out later - printk doesn't work right now so
* the user would just see a hanging kernel.
+ * When running as a Xen domain we are already in PAE mode at
+ * this point.
*/
if (cpu_has_pae)
set_in_cr4(X86_CR4_PAE);
@@ -491,6 +565,22 @@ void __init paging_init(void)
__flush_tlb_all();
kmap_init();
+
+ /* Switch to the real shared_info page, and clear the dummy page. */
+ set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info);
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+ /* Set up the mapping of the lower 1MB (ISA space). */
+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
+ if (xen_start_info.flags & SIF_PRIVILEGED)
+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+ else
+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
+ virt_to_machine(empty_zero_page),
+ PAGE_KERNEL_RO);
+#endif
}
/*
@@ -539,6 +629,7 @@ void __init mem_init(void)
int codesize, reservedpages, datasize, initsize;
int tmp;
int bad_ppro;
+ unsigned long pfn;
#ifndef CONFIG_DISCONTIGMEM
if (!mem_map)
@@ -564,9 +655,18 @@ void __init mem_init(void)
#else
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
-
+ printk("vmalloc area: %lx-%lx, maxmem %lx\n",
+ VMALLOC_START,VMALLOC_END,MAXMEM);
+ BUG_ON(VMALLOC_START > VMALLOC_END);
+
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
+ /* XEN: init and count low-mem pages outside initial allocation. */
+ for (pfn = xen_start_info.nr_pages; pfn < max_low_pfn; pfn++) {
+ ClearPageReserved(&mem_map[pfn]);
+ set_page_count(&mem_map[pfn], 1);
+ totalram_pages++;
+ }
reservedpages = 0;
for (tmp = 0; tmp < max_low_pfn; tmp++)
@@ -630,11 +730,16 @@ void __init pgtable_cache_init(void)
panic("pgtable_cache_init(): cannot create pmd cache");
}
pgd_cache = kmem_cache_create("pgd",
+#if 0 /* How does this work in native Linux? */
PTRS_PER_PGD*sizeof(pgd_t),
PTRS_PER_PGD*sizeof(pgd_t),
+#else
+ PAGE_SIZE,
+ PAGE_SIZE,
+#endif
0,
pgd_ctor,
- PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
+ pgd_dtor);
if (!pgd_cache)
panic("pgtable_cache_init(): Cannot create pgd cache");
}
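One remark on the pgd_cache change just above: under Xen the pgd is
write-protected and pinned as a whole page (see the pgd_free()/mm_pin()
changes in the pgtable.c diff below), so a pgd cannot share its page with
other slab objects; forcing the object size and alignment to PAGE_SIZE
guarantees that. A trivial sketch of the invariant this buys
(check_pgd_page is a hypothetical debug helper, not in this patch):

	/* Hedged sketch: with PAGE_SIZE-sized, PAGE_SIZE-aligned slab
	 * objects, every pgd owns its page outright, so making it
	 * read-only cannot affect unrelated allocations. */
	static void check_pgd_page(pgd_t *pgd)
	{
		BUG_ON(((unsigned long)pgd & ~PAGE_MASK) != 0);
	}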
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/ioremap.c linux-2.6-xen-sparse/arch/i386/mm/ioremap.c
--- pristine-linux-2.6.12/arch/i386/mm/ioremap.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mm/ioremap.c 2005-07-28 13:17:07.000000000 -0700
@@ -11,91 +11,54 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/module.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
-#define ISA_START_ADDRESS 0xa0000
-#define ISA_END_ADDRESS 0x100000
+#ifndef CONFIG_XEN_PHYSDEV_ACCESS
-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, unsigned long phys_addr, unsigned long flags)
+void * __ioremap(unsigned long phys_addr, unsigned long size,
+ unsigned long flags)
{
- pte_t *pte;
- unsigned long pfn;
-
- pfn = phys_addr >> PAGE_SHIFT;
- pte = pte_alloc_kernel(&init_mm, pmd, addr);
- if (!pte)
- return -ENOMEM;
- do {
- BUG_ON(!pte_none(*pte));
- set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW |
- _PAGE_DIRTY | _PAGE_ACCESSED | flags)));
- pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- return 0;
+ return NULL;
}
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, unsigned long phys_addr, unsigned long flags)
+void *ioremap_nocache (unsigned long phys_addr, unsigned long size)
{
- pmd_t *pmd;
- unsigned long next;
-
- phys_addr -= addr;
- pmd = pmd_alloc(&init_mm, pud, addr);
- if (!pmd)
- return -ENOMEM;
- do {
- next = pmd_addr_end(addr, end);
- if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, flags))
- return -ENOMEM;
- } while (pmd++, addr = next, addr != end);
- return 0;
+ return NULL;
}
-static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, unsigned long phys_addr, unsigned long flags)
+void iounmap(volatile void __iomem *addr)
{
- pud_t *pud;
- unsigned long next;
+}
- phys_addr -= addr;
- pud = pud_alloc(&init_mm, pgd, addr);
- if (!pud)
- return -ENOMEM;
- do {
- next = pud_addr_end(addr, end);
- if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, flags))
- return -ENOMEM;
- } while (pud++, addr = next, addr != end);
- return 0;
+void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
+{
+ return NULL;
}
-static int ioremap_page_range(unsigned long addr,
- unsigned long end, unsigned long phys_addr, unsigned long flags)
+void __init bt_iounmap(void *addr, unsigned long size)
{
- pgd_t *pgd;
- unsigned long next;
- int err;
+}
- BUG_ON(addr >= end);
- flush_cache_all();
- phys_addr -= addr;
- pgd = pgd_offset_k(addr);
- spin_lock(&init_mm.page_table_lock);
- do {
- next = pgd_addr_end(addr, end);
- err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags);
- if (err)
- break;
- } while (pgd++, addr = next, addr != end);
- spin_unlock(&init_mm.page_table_lock);
- flush_tlb_all();
- return err;
+#else
+
+/*
+ * Does @address reside within a non-highmem page that is local to this virtual
+ * machine (i.e., not an I/O page, nor a memory page belonging to another VM)?
+ * See the comment that accompanies pte_pfn() in pgtable-2level.h to understand
+ * why this works.
+ */
+static inline int is_local_lowmem(unsigned long address)
+{
+ extern unsigned long max_low_pfn;
+ unsigned long mfn = address >> PAGE_SHIFT;
+ unsigned long pfn = mfn_to_pfn(mfn);
+ return ((pfn < max_low_pfn) && (pfn_to_mfn(pfn) == mfn));
}
/*
@@ -116,31 +79,36 @@ void __iomem * __ioremap(unsigned long p
void __iomem * addr;
struct vm_struct * area;
unsigned long offset, last_addr;
+ domid_t domid = DOMID_IO;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
- if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
- return (void __iomem *) phys_to_virt(phys_addr);
+ if (phys_addr >= 0x0 && last_addr < 0x100000)
+ return isa_bus_to_virt(phys_addr);
+#endif
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- if (phys_addr <= virt_to_phys(high_memory - 1)) {
+ if (is_local_lowmem(phys_addr)) {
char *t_addr, *t_end;
struct page *page;
- t_addr = __va(phys_addr);
+ t_addr = bus_to_virt(phys_addr);
t_end = t_addr + (size - 1);
 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
if(!PageReserved(page))
return NULL;
+
+ domid = DOMID_SELF;
}
/*
@@ -158,8 +126,10 @@ void __iomem * __ioremap(unsigned long p
return NULL;
area->phys_addr = phys_addr;
addr = (void __iomem *) area->addr;
- if (ioremap_page_range((unsigned long) addr,
- (unsigned long) addr + size, phys_addr, flags)) {
+ if (direct_remap_area_pages(&init_mm, (unsigned long) addr, phys_addr,
+ size, __pgprot(_PAGE_PRESENT | _PAGE_RW |
+ _PAGE_DIRTY | _PAGE_ACCESSED
+ | flags), domid)) {
vunmap((void __force *) addr);
return NULL;
}
@@ -199,8 +169,8 @@ void __iomem *ioremap_nocache (unsigned
/* Guaranteed to be > phys_addr, as per __ioremap() */
last_addr = phys_addr + size - 1;
- if (last_addr < virt_to_phys(high_memory) - 1) {
- struct page *ppage = virt_to_page(__va(phys_addr));
+ if (is_local_lowmem(last_addr)) {
+ struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
unsigned long npages;
phys_addr &= PAGE_MASK;
@@ -227,32 +197,24 @@ void iounmap(volatile void __iomem *addr
{
struct vm_struct *p;
if ((void __force *) addr <= high_memory)
+ return;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
return;
-
- /*
- * __ioremap special-cases the PCI/ISA range by not instantiating a
- * vm_area and by simply returning an address into the kernel mapping
- * of ISA space. So handle that here.
- */
- if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
- addr < phys_to_virt(ISA_END_ADDRESS))
- return;
-
- write_lock(&vmlist_lock);
- p = __remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr));
+#endif
+ p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr));
if (!p) {
- printk("iounmap: bad address %p\n", addr);
- goto out_unlock;
+ printk("__iounmap: bad address %p\n", addr);
+ return;
}
- if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
- change_page_attr(virt_to_page(__va(p->phys_addr)),
- p->size >> PAGE_SHIFT,
- PAGE_KERNEL);
+ if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
+ /* p->size includes the guard page, but cpa doesn't like that */
+ change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
+ (p->size - PAGE_SIZE) >> PAGE_SHIFT,
+ PAGE_KERNEL);
global_flush_tlb();
}
-out_unlock:
- write_unlock(&vmlist_lock);
kfree(p);
}
@@ -267,11 +229,13 @@ void __init *bt_ioremap(unsigned long ph
if (!size || last_addr < phys_addr)
return NULL;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
- if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
- return phys_to_virt(phys_addr);
+ if (phys_addr >= 0x0 && last_addr < 0x100000)
+ return isa_bus_to_virt(phys_addr);
+#endif
/*
* Mappings have to be page-aligned
@@ -310,6 +274,10 @@ void __init bt_iounmap(void *addr, unsig
virt_addr = (unsigned long)addr;
if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
return;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
+ return;
+#endif
offset = virt_addr & ~PAGE_MASK;
nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
@@ -320,3 +288,155 @@ void __init bt_iounmap(void *addr, unsig
--nrpages;
}
}
+
+#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
+
+/* These hacky macros avoid phys->machine translations. */
+#define __direct_pte(x) ((pte_t) { (x) } )
+#define __direct_mk_pte(page_nr,pgprot) \
+ __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
+#define direct_mk_pte_phys(physpage, pgprot) \
+ __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot)
+
+static inline void direct_remap_area_pte(pte_t *pte,
+ unsigned long address,
+ unsigned long size,
+ mmu_update_t **v)
+{
+ unsigned long end;
+
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+ if (address >= end)
+ BUG();
+
+ do {
+ (*v)->ptr = virt_to_machine(pte);
+ (*v)++;
+ address += PAGE_SIZE;
+ pte++;
+ } while (address && (address < end));
+}
+
+static inline int direct_remap_area_pmd(struct mm_struct *mm,
+ pmd_t *pmd,
+ unsigned long address,
+ unsigned long size,
+ mmu_update_t **v)
+{
+ unsigned long end;
+
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+ if (address >= end)
+ BUG();
+ do {
+ pte_t *pte = (mm == &init_mm) ?
+ pte_alloc_kernel(mm, pmd, address) :
+ pte_alloc_map(mm, pmd, address);
+ if (!pte)
+ return -ENOMEM;
+ direct_remap_area_pte(pte, address, end - address, v);
+ pte_unmap(pte);
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address && (address < end));
+ return 0;
+}
+
+int __direct_remap_area_pages(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long size,
+ mmu_update_t *v)
+{
+ pgd_t * dir;
+ unsigned long end = address + size;
+ int error;
+
+ dir = pgd_offset(mm, address);
+ if (address >= end)
+ BUG();
+ spin_lock(&mm->page_table_lock);
+ do {
+ pud_t *pud;
+ pmd_t *pmd;
+
+ error = -ENOMEM;
+ pud = pud_alloc(mm, dir, address);
+ if (!pud)
+ break;
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd)
+ break;
+ error = 0;
+ direct_remap_area_pmd(mm, pmd, address, end - address, &v);
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ dir++;
+
+ } while (address && (address < end));
+ spin_unlock(&mm->page_table_lock);
+ return error;
+}
+
+
+int direct_remap_area_pages(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long machine_addr,
+ unsigned long size,
+ pgprot_t prot,
+ domid_t domid)
+{
+ int i;
+ unsigned long start_address;
+#define MAX_DIRECTMAP_MMU_QUEUE 130
+ mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u;
+
+ start_address = address;
+
+ flush_cache_all();
+
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ if ((v - u) == MAX_DIRECTMAP_MMU_QUEUE) {
+ /* Fill in the PTE pointers. */
+ __direct_remap_area_pages(mm,
+ start_address,
+ address-start_address,
+ u);
+
+ if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
+ return -EFAULT;
+ v = u;
+ start_address = address;
+ }
+
+ /*
+ * Fill in the machine address: PTE ptr is done later by
+ * __direct_remap_area_pages().
+ */
+ v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot);
+
+ machine_addr += PAGE_SIZE;
+ address += PAGE_SIZE;
+ v++;
+ }
+
+ if (v != u) {
+ /* Get the PTE pointers filled in. */
+ __direct_remap_area_pages(mm,
+ start_address,
+ address-start_address,
+ u);
+ if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
+ return -EFAULT;
+ }
+
+ flush_tlb_all();
+
+ return 0;
+}
+
+EXPORT_SYMBOL(direct_remap_area_pages);
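direct_remap_area_pages() is the exported workhorse here; the reworked
__ioremap() above is the canonical caller. For reviewers, a condensed usage
sketch for mapping a machine (or foreign-domain) address range, mirroring
what __ioremap() does; map_machine_range is a hypothetical helper, error
handling and page-alignment checks are abbreviated, and it assumes the
stock 2.6.12 get_vm_area()/vunmap() interfaces:

	/* Hedged sketch: map size bytes at machine_addr into kernel
	 * virtual space on behalf of domain domid. */
	static void __iomem *map_machine_range(unsigned long machine_addr,
					       unsigned long size, domid_t domid)
	{
		struct vm_struct *area = get_vm_area(size, VM_IOREMAP);
		if (area == NULL)
			return NULL;
		if (direct_remap_area_pages(&init_mm, (unsigned long)area->addr,
					    machine_addr, size,
					    __pgprot(_PAGE_PRESENT | _PAGE_RW |
						     _PAGE_DIRTY | _PAGE_ACCESSED),
					    domid)) {
			vunmap(area->addr);
			return NULL;
		}
		return (void __iomem *)area->addr;
	}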
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/Makefile linux-2.6-xen-sparse/arch/i386/mm/Makefile
--- pristine-linux-2.6.12/arch/i386/mm/Makefile 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mm/Makefile 2005-07-28 13:17:07.000000000 -0700
@@ -2,9 +2,23 @@
# Makefile for the linux i386-specific parts of the memory manager.
#
-obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o
+XENARCH := $(subst ",,$(CONFIG_XENARCH))
-obj-$(CONFIG_DISCONTIGMEM) += discontig.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+CFLAGS += -Iarch/$(XENARCH)/mm
+
+obj-y := init.o pgtable.o fault.o ioremap.o hypervisor.o
+c-obj-y := extable.o mmap.o pageattr.o
+
+c-obj-$(CONFIG_DISCONTIGMEM) += discontig.o
+c-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_HIGHMEM) += highmem.o
-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
+c-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
+
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+ @ln -fsn $(srctree)/arch/i386/mm/$(notdir $@) $@
+
+obj-y += $(c-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/pgtable.c linux-2.6-xen-sparse/arch/i386/mm/pgtable.c
--- pristine-linux-2.6.12/arch/i386/mm/pgtable.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mm/pgtable.c 2005-07-28 13:17:07.000000000 -0700
@@ -21,6 +21,10 @@
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+
+#include <asm-xen/foreign_page.h>
void show_mem(void)
{
@@ -93,6 +97,44 @@ static void set_pte_pfn(unsigned long va
}
/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
+ pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ BUG();
+ return;
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ BUG();
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ /* <pfn,flags> stored as-is, to permit clearing entries */
+ set_pte(pte, pfn_pte_ma(pfn, flags));
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+/*
* Associate a large virtual page frame with a given physical page frame
* and protection flags for that frame. pfn is for the base of the page,
* vaddr is what the page gets mapped to - both must be properly aligned.
@@ -135,12 +177,26 @@ void __set_fixmap (enum fixed_addresses
BUG();
return;
}
- set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+ switch (idx) {
+ case FIX_WP_TEST:
+ case FIX_VSYSCALL:
+#ifdef CONFIG_X86_F00F_BUG
+ case FIX_F00F_IDT:
+#endif
+ set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+ break;
+ default:
+ set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
+ break;
+ }
}
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+ if (pte)
+ make_page_readonly(pte);
+ return pte;
}
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -151,10 +207,29 @@ struct page *pte_alloc_one(struct mm_str
pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ if (pte) {
+ SetPageForeign(pte, pte_free);
+ set_page_count(pte, 1);
+ }
#endif
+
return pte;
}
+void pte_free(struct page *pte)
+{
+ unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
+
+ if (!pte_write(*virt_to_ptep(va)))
+ HYPERVISOR_update_va_mapping(
+ va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0);
+
+ ClearPageForeign(pte);
+ set_page_count(pte, 1);
+
+ __free_page(pte);
+}
+
void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
{
memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
@@ -199,14 +274,14 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
{
unsigned long flags;
- if (PTRS_PER_PMD == 1)
+ if (!HAVE_SHARED_KERNEL_PMD)
spin_lock_irqsave(&pgd_lock, flags);
memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
swapper_pg_dir + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
- if (PTRS_PER_PMD > 1)
+ if (HAVE_SHARED_KERNEL_PMD)
return;
pgd_list_add(pgd);
@@ -214,11 +289,13 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
}
-/* never called when PTRS_PER_PMD > 1 */
void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
unsigned long flags; /* can be called from interrupt context */
+ if (HAVE_SHARED_KERNEL_PMD)
+ return;
+
spin_lock_irqsave(&pgd_lock, flags);
pgd_list_del(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
@@ -226,12 +303,30 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
pgd_t *pgd_alloc(struct mm_struct *mm)
{
- int i;
+ int i = 0;
pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
if (PTRS_PER_PMD == 1 || !pgd)
return pgd;
+ if (!HAVE_SHARED_KERNEL_PMD) {
+ /* alloc and copy kernel pmd */
+ unsigned long flags;
+ pgd_t *copy_pgd = pgd_offset_k(PAGE_OFFSET);
+ pud_t *copy_pud = pud_offset(copy_pgd, PAGE_OFFSET);
+ pmd_t *copy_pmd = pmd_offset(copy_pud, PAGE_OFFSET);
+ pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+ if (0 == pmd)
+ goto out_oom;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ memcpy(pmd, copy_pmd, PAGE_SIZE);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ make_page_readonly(pmd);
+ set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
+ }
+
+ /* alloc user pmds */
for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
if (!pmd)
@@ -250,11 +345,207 @@ out_oom:
void pgd_free(pgd_t *pgd)
{
int i;
+ pte_t *ptep = virt_to_ptep(pgd);
+
+ if (!pte_write(*ptep)) {
+ xen_pgd_unpin(__pa(pgd));
+ HYPERVISOR_update_va_mapping(
+ (unsigned long)pgd,
+ pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
+ 0);
+ }
/* in the PAE case user pgd entries are overwritten before usage */
- if (PTRS_PER_PMD > 1)
- for (i = 0; i < USER_PTRS_PER_PGD; ++i)
- kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+ if (PTRS_PER_PMD > 1) {
+ for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
+ kmem_cache_free(pmd_cache, pmd);
+ }
+ if (!HAVE_SHARED_KERNEL_PMD) {
+ pmd_t *pmd = (void *)__va(pgd_val(pgd[USER_PTRS_PER_PGD])-1);
+ make_page_writable(pmd);
+ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+ kmem_cache_free(pmd_cache, pmd);
+ }
+ }
/* in the non-PAE case, free_pgtables() clears user pgd entries */
kmem_cache_free(pgd_cache, pgd);
}
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+void make_lowmem_page_readonly(void *va)
+{
+ pte_t *pte = virt_to_ptep(va);
+ set_pte(pte, pte_wrprotect(*pte));
+}
+
+void make_lowmem_page_writable(void *va)
+{
+ pte_t *pte = virt_to_ptep(va);
+ set_pte(pte, pte_mkwrite(*pte));
+}
+
+void make_page_readonly(void *va)
+{
+ pte_t *pte = virt_to_ptep(va);
+ set_pte(pte, pte_wrprotect(*pte));
+ if ( (unsigned long)va >= (unsigned long)high_memory )
+ {
+ unsigned long phys;
+ phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK);
+#ifdef CONFIG_HIGHMEM
+ if ( (phys >> PAGE_SHIFT) < highstart_pfn )
+#endif
+ make_lowmem_page_readonly(phys_to_virt(phys));
+ }
+}
+
+void make_page_writable(void *va)
+{
+ pte_t *pte = virt_to_ptep(va);
+ set_pte(pte, pte_mkwrite(*pte));
+ if ( (unsigned long)va >= (unsigned long)high_memory )
+ {
+ unsigned long phys;
+ phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK);
+#ifdef CONFIG_HIGHMEM
+ if ( (phys >> PAGE_SHIFT) < highstart_pfn )
+#endif
+ make_lowmem_page_writable(phys_to_virt(phys));
+ }
+}
+
+void make_pages_readonly(void *va, unsigned int nr)
+{
+ while ( nr-- != 0 )
+ {
+ make_page_readonly(va);
+ va = (void *)((unsigned long)va + PAGE_SIZE);
+ }
+}
+
+void make_pages_writable(void *va, unsigned int nr)
+{
+ while ( nr-- != 0 )
+ {
+ make_page_writable(va);
+ va = (void *)((unsigned long)va + PAGE_SIZE);
+ }
+}
+#endif /* CONFIG_XEN_SHADOW_MODE */
+
+LIST_HEAD(mm_unpinned);
+DEFINE_SPINLOCK(mm_unpinned_lock);
+
+static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
+{
+ struct page *page = virt_to_page(pt);
+ unsigned long pfn = page_to_pfn(page);
+
+ if (PageHighMem(page))
+ return;
+ HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(pfn, flags), 0);
+}
+
+static void mm_walk(struct mm_struct *mm, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int g,u,m;
+
+ pgd = mm->pgd;
+ for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
+ if (pgd_none(*pgd))
+ continue;
+ pud = pud_offset(pgd, 0);
+ if (PTRS_PER_PUD > 1) /* not folded */
+ mm_walk_set_prot(pud,flags);
+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, 0);
+ if (PTRS_PER_PMD > 1) /* not folded */
+ mm_walk_set_prot(pmd,flags);
+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+ if (pmd_none(*pmd))
+ continue;
+ pte = pte_offset_kernel(pmd,0);
+ mm_walk_set_prot(pte,flags);
+ }
+ }
+ }
+}
+
+void mm_pin(struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+
+ mm_walk(mm, PAGE_KERNEL_RO);
+ HYPERVISOR_update_va_mapping(
+ (unsigned long)mm->pgd,
+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
+ UVMF_TLB_FLUSH);
+ xen_pgd_pin(__pa(mm->pgd));
+ mm->context.pinned = 1;
+ spin_lock(&mm_unpinned_lock);
+ list_del(&mm->context.unpinned);
+ spin_unlock(&mm_unpinned_lock);
+
+ spin_unlock(&mm->page_table_lock);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+
+ xen_pgd_unpin(__pa(mm->pgd));
+ HYPERVISOR_update_va_mapping(
+ (unsigned long)mm->pgd,
+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0);
+ mm_walk(mm, PAGE_KERNEL);
+ xen_tlb_flush();
+ mm->context.pinned = 0;
+ spin_lock(&mm_unpinned_lock);
+ list_add(&mm->context.unpinned, &mm_unpinned);
+ spin_unlock(&mm_unpinned_lock);
+
+ spin_unlock(&mm->page_table_lock);
+}
+
+void mm_pin_all(void)
+{
+ while (!list_empty(&mm_unpinned))
+ mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
+ context.unpinned));
+}
+
+void _arch_exit_mmap(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+
+ /*
+ * We aggressively remove the defunct pgd from cr3: unmap_vmas() executes
+ * *much* faster this way, as avoiding TLB flushes allows bigger
+ * write-protect batches.
+ */
+ if ( tsk->active_mm == mm )
+ {
+ tsk->active_mm = &init_mm;
+ atomic_inc(&init_mm.mm_count);
+
+ switch_mm(mm, &init_mm, tsk);
+
+ atomic_dec(&mm->mm_count);
+ BUG_ON(atomic_read(&mm->mm_count) == 0);
+ }
+
+ task_unlock(tsk);
+
+ if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
+ mm_unpin(mm);
+}
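The pin/unpin machinery above keeps every not-yet-pinned mm on the
mm_unpinned list so that mm_pin_all() can pin them wholesale, and mm_pin()
makes all of an mm's page-table pages read-only before handing the pgd to
the hypervisor. The expected caller is not part of this diff; a plausible
sketch of switch-time lazy pinning, under that assumption:

	/* Hedged sketch: hypothetical lazy-pinning hook on the context
	 * switch path (not in this patch). The hypervisor only accepts a
	 * pgd as a base table once all of its page-table pages are
	 * read-only, which is exactly what mm_pin() above arranges. */
	static inline void ensure_pinned(struct mm_struct *next)
	{
		if (!next->context.pinned)
			mm_pin(next);
	}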
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/pci/irq.c linux-2.6-xen-sparse/arch/i386/pci/irq.c
--- pristine-linux-2.6.12/arch/i386/pci/irq.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/pci/irq.c 2005-07-28 13:17:07.000000000 -0700
@@ -68,7 +68,8 @@ static struct irq_routing_table * __init
int i;
u8 sum;
- for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
rt = (struct irq_routing_table *) addr;
if (rt->signature != PIRQ_SIGNATURE ||
rt->version != PIRQ_VERSION ||
@@ -83,6 +84,8 @@ static struct irq_routing_table * __init
return rt;
}
}
+#endif
+
return NULL;
}
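The irq.c change above matters because, under Xen, __va() translates
pseudophysical addresses, which do not cover the machine BIOS region; a
privileged guest instead reaches the real ISA space through the fixmap
established in paging_init() earlier in this patch, hence isa_bus_to_virt().
A small sketch of scanning that window (bios_region_sum is a hypothetical
helper; only isa_bus_to_virt() is from the patch):

	/* Hedged sketch: byte-sum a range of machine ISA space through
	 * the privileged guest's ISA fixmap, as the PIRQ table probe
	 * above does for its checksum. */
	static u8 bios_region_sum(unsigned long isa_addr, unsigned int len)
	{
		u8 *p = (u8 *)isa_bus_to_virt(isa_addr);
		u8 sum = 0;

		while (len--)
			sum += *p++;
		return sum;
	}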
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/pci/Makefile linux-2.6-xen-sparse/arch/i386/pci/Makefile
--- pristine-linux-2.6.12/arch/i386/pci/Makefile 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/pci/Makefile 2005-07-28 13:17:07.000000000 -0700
@@ -1,14 +1,32 @@
-obj-y := i386.o
+XENARCH := $(subst ",,$(CONFIG_XENARCH))
-obj-$(CONFIG_PCI_BIOS) += pcbios.o
-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
-obj-$(CONFIG_PCI_DIRECT) += direct.o
+CFLAGS += -Iarch/$(XENARCH)/pci
-pci-y := fixup.o
-pci-$(CONFIG_ACPI_PCI) += acpi.o
-pci-y += legacy.o irq.o
+c-obj-y := i386.o
-pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
-pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o
+c-obj-$(CONFIG_PCI_BIOS) += pcbios.o
+c-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
+c-obj-$(CONFIG_PCI_DIRECT) += direct.o
-obj-y += $(pci-y) common.o
+c-pci-y := fixup.o
+c-pci-$(CONFIG_ACPI_PCI) += acpi.o
+c-pci-y += legacy.o
+# Make sure irq.o gets linked in after legacy.o
+l-pci-y += irq.o
+
+c-pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
+pci-$(CONFIG_X86_VISWS) :=
+c-pci-$(CONFIG_X86_NUMAQ) := numa.o
+pci-$(CONFIG_X86_NUMAQ) := irq.o
+
+obj-y += $(pci-y)
+c-obj-y += $(c-pci-y) common.o
+
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+ @ln -fsn $(srctree)/arch/i386/pci/$(notdir $@) $@
+
+obj-y += $(c-obj-y) $(l-pci-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
_______________________________________________
Xen-merge mailing list
Xen-merge@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-merge