[PATCH 03/04] Kexec / Kdump: x86_32 specific code
This patch contains the x86_32 implementation of Kexec / Kdump for Xen.
Signed-Off-By: Magnus Damm <magnus@xxxxxxxxxxxxx>
---
Applies on top of xen-unstable-11760.
buildconfigs/linux-defconfig_xen_x86_32 | 2
linux-2.6-xen-sparse/arch/i386/Kconfig | 2
linux-2.6-xen-sparse/arch/i386/kernel/Makefile | 2
linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c | 25
linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h | 8
patches/linux-2.6.16.29/series | 3
linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h | 57 +
patches/linux-2.6.16.29/git-35..cc9.patch | 401 +++++++
patches/linux-2.6.16.29/linux-2.6.19-rc1-kexe..code-i386.patch | 169 ++++
patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-i386.patch | 54 +
xen/arch/x86/crash.c | 47 +
xen/arch/x86/x86_32/entry.S | 2
xen/arch/x86/x86_32/machine_kexec.c | 25
xen/include/asm-x86/x86_32/elf.h | 32
xen/include/asm-x86/x86_32/kexec.h | 65 +
15 files changed, 863 insertions(+), 31 deletions(-)
--- 0002/buildconfigs/linux-defconfig_xen_x86_32
+++ work/buildconfigs/linux-defconfig_xen_x86_32 2006-10-16 12:23:54.000000000 +0900
@@ -183,6 +183,7 @@ CONFIG_MTRR=y
CONFIG_REGPARM=y
CONFIG_SECCOMP=y
CONFIG_HZ_100=y
+CONFIG_KEXEC=y
# CONFIG_HZ_250 is not set
# CONFIG_HZ_1000 is not set
CONFIG_HZ=100
@@ -1036,6 +1037,7 @@ CONFIG_DNOTIFY=y
#
CONFIG_PROC_FS=y
CONFIG_PROC_KCORE=y
+# CONFIG_PROC_VMCORE is not set
CONFIG_SYSFS=y
CONFIG_TMPFS=y
# CONFIG_HUGETLB_PAGE is not set
--- 0001/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ work/linux-2.6-xen-sparse/arch/i386/Kconfig 2006-10-16 12:23:54.000000000 +0900
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call (EXPERIMENTAL)"
- depends on EXPERIMENTAL && !X86_XEN
+ depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
--- 0001/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ work/linux-2.6-xen-sparse/arch/i386/kernel/Makefile 2006-10-16 12:23:54.000000000 +0900
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
obj-y += fixup.o
microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o
obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
obj-y := $(call cherrypickxen, $(obj-y))
--- 0001/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ work/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c 2006-10-16 12:40:53.000000000 +0900
@@ -69,6 +69,10 @@
#include "setup_arch_pre.h"
#include <bios_ebda.h>
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
/* Forward Declaration. */
void __init find_max_pfn(void);
@@ -943,6 +947,7 @@ static void __init parse_cmdline_early (
* after a kernel panic.
*/
else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
unsigned long size, base;
size = memparse(from+12, &from);
if (*from == '@') {
@@ -953,6 +958,10 @@ static void __init parse_cmdline_early (
crashk_res.start = base;
crashk_res.end = base + size - 1;
}
+#else
+ printk("Ignoring crashkernel command line, "
+ "parameter will be supplied by xen\n");
+#endif
}
#endif
#ifdef CONFIG_PROC_VMCORE
@@ -1322,9 +1331,22 @@ void __init setup_bootmem_allocator(void
}
#endif
#ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
if (crashk_res.start != crashk_res.end)
reserve_bootmem(crashk_res.start,
crashk_res.end - crashk_res.start + 1);
+#else
+ {
+ xen_kexec_reserve_t reservation;
+ BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+ &reservation));
+ if (reservation.size) {
+ crashk_res.start = reservation.start;
+ crashk_res.end = reservation.start +
+ reservation.size - 1;
+ }
+ }
+#endif
#endif
if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1389,7 +1411,8 @@ legacy_init_iomem_resources(struct e820e
request_resource(res, data_resource);
#endif
#ifdef CONFIG_KEXEC
- request_resource(res, &crashk_res);
+ if (crashk_res.start != crashk_res.end)
+ request_resource(res, &crashk_res);
#endif
}
}
--- /dev/null
+++ work/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h 2006-10-16 12:23:55.000000000 +0900
@@ -0,0 +1,57 @@
+/*
+ * include/asm-i386/kexec-xen.h
+ *
+ * Created By: Horms <horms@xxxxxxxxxxxx>
+ */
+
+#ifndef _I386_KEXEC_XEN_H
+#define _I386_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_32.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+ struct cpu_user_regs *xen_regs)
+{
+ xen_regs->ebx = linux_regs->ebx;
+ xen_regs->ecx = linux_regs->ecx;
+ xen_regs->edx = linux_regs->edx;
+ xen_regs->esi = linux_regs->esi;
+ xen_regs->edi = linux_regs->edi;
+ xen_regs->ebp = linux_regs->ebp;
+ xen_regs->eax = linux_regs->eax;
+ xen_regs->esp = linux_regs->esp;
+ xen_regs->ss = linux_regs->xss;
+ xen_regs->cs = linux_regs->xcs;
+ xen_regs->ds = linux_regs->xds;
+ xen_regs->es = linux_regs->xes;
+ xen_regs->eflags = linux_regs->eflags;
+}
+
+/* Kexec needs to know about the actual physical address.
+ * But in xen, on some architectures, a physical address is a
+ * pseudo-physical address. */
+#ifdef CONFIG_XEN
+#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#else
+#define kexec_page_to_pfn(page) page_to_pfn(page)
+#define kexec_pfn_to_page(pfn) pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
+#endif /* _I386_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ * c-file-style: "linux"
+ * indent-tabs-mode: t
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * tab-width: 8
+ * End:
+ */
--- 0001/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ work/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h 2006-10-16 12:23:54.000000000 +0900
@@ -385,5 +385,13 @@ HYPERVISOR_xenoprof_op(
return _hypercall2(int, xenoprof_op, op, arg);
}
+static inline int
+HYPERVISOR_kexec(
+ unsigned long op, unsigned int arg1, void * extra_args)
+{
+ return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
+
#endif /* __HYPERCALL_H__ */
--- /dev/null
+++ work/patches/linux-2.6.16.29/git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch 2006-10-16 12:23:55.000000000 +0900
@@ -0,0 +1,401 @@
+From: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Date: Tue, 26 Sep 2006 08:52:38 +0000 (+0200)
+Subject: [PATCH] i386: Avoid overwriting the current pgd (V4, i386)
+X-Git-Url: http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=3566561bfadffcb5dbc85d576be80c0dbf2cccc9
+
+[PATCH] i386: Avoid overwriting the current pgd (V4, i386)
+
+kexec: Avoid overwriting the current pgd (V4, i386)
+
+This patch upgrades the i386-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables. These tables are used to provide
+an executable identity mapping without overwriting the current pgd.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Signed-off-by: Andi Kleen <ak@xxxxxxx>
+---
+
+--- a/arch/i386/kernel/machine_kexec.c
++++ b/arch/i386/kernel/machine_kexec.c
+@@ -21,70 +21,13 @@
+ #include <asm/system.h>
+
+ #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+-
+-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L2_ATTR (_PAGE_PRESENT)
+-
+-#define LEVEL0_SIZE (1UL << 12UL)
+-
+-#ifndef CONFIG_X86_PAE
+-#define LEVEL1_SIZE (1UL << 22UL)
+-static u32 pgtable_level1[1024] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+- unsigned long level1_index, level2_index;
+- u32 *pgtable_level2;
+-
+- /* Find the current page table */
+- pgtable_level2 = __va(read_cr3());
+-
+- /* Find the indexes of the physical address to identity map */
+- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+- level2_index = address / LEVEL1_SIZE;
+-
+- /* Identity map the page table entry */
+- pgtable_level1[level1_index] = address | L0_ATTR;
+- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-
+- /* Flush the tlb so the new mapping takes effect.
+- * Global tlb entries are not flushed but that is not an issue.
+- */
+- load_cr3(pgtable_level2);
+-}
+-
+-#else
+-#define LEVEL1_SIZE (1UL << 21UL)
+-#define LEVEL2_SIZE (1UL << 30UL)
+-static u64 pgtable_level1[512] PAGE_ALIGNED;
+-static u64 pgtable_level2[512] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+- unsigned long level1_index, level2_index, level3_index;
+- u64 *pgtable_level3;
+-
+- /* Find the current page table */
+- pgtable_level3 = __va(read_cr3());
+-
+- /* Find the indexes of the physical address to identity map */
+- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+- level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+- level3_index = address / LEVEL2_SIZE;
+-
+- /* Identity map the page table entry */
+- pgtable_level1[level1_index] = address | L0_ATTR;
+- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+- set_64bit(&pgtable_level3[level3_index],
+- __pa(pgtable_level2) | L2_ATTR);
+-
+- /* Flush the tlb so the new mapping takes effect.
+- * Global tlb entries are not flushed but that is not an issue.
+- */
+- load_cr3(pgtable_level3);
+-}
++static u32 kexec_pgd[1024] PAGE_ALIGNED;
++#ifdef CONFIG_X86_PAE
++static u32 kexec_pmd0[1024] PAGE_ALIGNED;
++static u32 kexec_pmd1[1024] PAGE_ALIGNED;
+ #endif
++static u32 kexec_pte0[1024] PAGE_ALIGNED;
++static u32 kexec_pte1[1024] PAGE_ALIGNED;
+
+ static void set_idt(void *newidt, __u16 limit)
+ {
+@@ -128,16 +71,6 @@ static void load_segments(void)
+ #undef __STR
+ }
+
+-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+- unsigned long indirection_page,
+- unsigned long reboot_code_buffer,
+- unsigned long start_address,
+- unsigned int has_pae) ATTRIB_NORET;
+-
+-extern const unsigned char relocate_new_kernel[];
+-extern void relocate_new_kernel_end(void);
+-extern const unsigned int relocate_new_kernel_size;
+-
+ /*
+ * A architecture hook called to validate the
+ * proposed image and prepare the control pages
+@@ -170,25 +103,29 @@ void machine_kexec_cleanup(struct kimage
+ */
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+- unsigned long page_list;
+- unsigned long reboot_code_buffer;
+-
+- relocate_new_kernel_t rnk;
++ unsigned long page_list[PAGES_NR];
++ void *control_page;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+- /* Compute some offsets */
+- reboot_code_buffer = page_to_pfn(image->control_code_page)
+- << PAGE_SHIFT;
+- page_list = image->head;
+-
+- /* Set up an identity mapping for the reboot_code_buffer */
+- identity_map_page(reboot_code_buffer);
+-
+- /* copy it out */
+- memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+- relocate_new_kernel_size);
++ control_page = page_address(image->control_code_page);
++ memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++ page_list[PA_CONTROL_PAGE] = __pa(control_page);
++ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
++ page_list[PA_PGD] = __pa(kexec_pgd);
++ page_list[VA_PGD] = (unsigned long)kexec_pgd;
++#ifdef CONFIG_X86_PAE
++ page_list[PA_PMD_0] = __pa(kexec_pmd0);
++ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
++ page_list[PA_PMD_1] = __pa(kexec_pmd1);
++ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
++#endif
++ page_list[PA_PTE_0] = __pa(kexec_pte0);
++ page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
++ page_list[PA_PTE_1] = __pa(kexec_pte1);
++ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+ /* The segment registers are funny things, they have both a
+ * visible and an invisible part. Whenever the visible part is
+@@ -207,8 +144,8 @@ NORET_TYPE void machine_kexec(struct kim
+ set_idt(phys_to_virt(0),0);
+
+ /* now call it */
+- rnk = (relocate_new_kernel_t) reboot_code_buffer;
+- (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
++ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
++ image->start, cpu_has_pae);
+ }
+
+ /* crashkernel=size@addr specifies the location to reserve for
+--- a/arch/i386/kernel/relocate_kernel.S
++++ b/arch/i386/kernel/relocate_kernel.S
+@@ -7,16 +7,138 @@
+ */
+
+ #include <linux/linkage.h>
++#include <asm/page.h>
++#include <asm/kexec.h>
++
++/*
++ * Must be relocatable PIC code callable as a C function
++ */
++
++#define PTR(x) (x << 2)
++#define PAGE_ALIGNED (1 << PAGE_SHIFT)
++#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
++#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
++
++ .text
++ .align PAGE_ALIGNED
++ .globl relocate_kernel
++relocate_kernel:
++ movl 8(%esp), %ebp /* list of pages */
++
++#ifdef CONFIG_X86_PAE
++ /* map the control page at its virtual address */
++
++ movl PTR(VA_PGD)(%ebp), %edi
++ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
++ andl $0xc0000000, %eax
++ shrl $27, %eax
++ addl %edi, %eax
++
++ movl PTR(PA_PMD_0)(%ebp), %edx
++ orl $PAE_PGD_ATTR, %edx
++ movl %edx, (%eax)
++
++ movl PTR(VA_PMD_0)(%ebp), %edi
++ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
++ andl $0x3fe00000, %eax
++ shrl $18, %eax
++ addl %edi, %eax
++
++ movl PTR(PA_PTE_0)(%ebp), %edx
++ orl $PAGE_ATTR, %edx
++ movl %edx, (%eax)
++
++ movl PTR(VA_PTE_0)(%ebp), %edi
++ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
++ andl $0x001ff000, %eax
++ shrl $9, %eax
++ addl %edi, %eax
++
++ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
++ orl $PAGE_ATTR, %edx
++ movl %edx, (%eax)
++
++ /* identity map the control page at its physical address */
++
++ movl PTR(VA_PGD)(%ebp), %edi
++ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
++ andl $0xc0000000, %eax
++ shrl $27, %eax
++ addl %edi, %eax
++
++ movl PTR(PA_PMD_1)(%ebp), %edx
++ orl $PAE_PGD_ATTR, %edx
++ movl %edx, (%eax)
++
++ movl PTR(VA_PMD_1)(%ebp), %edi
++ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
++ andl $0x3fe00000, %eax
++ shrl $18, %eax
++ addl %edi, %eax
++
++ movl PTR(PA_PTE_1)(%ebp), %edx
++ orl $PAGE_ATTR, %edx
++ movl %edx, (%eax)
++
++ movl PTR(VA_PTE_1)(%ebp), %edi
++ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
++ andl $0x001ff000, %eax
++ shrl $9, %eax
++ addl %edi, %eax
++
++ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
++ orl $PAGE_ATTR, %edx
++ movl %edx, (%eax)
++#else
++ /* map the control page at its virtual address */
++
++ movl PTR(VA_PGD)(%ebp), %edi
++ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
++ andl $0xffc00000, %eax
++ shrl $20, %eax
++ addl %edi, %eax
++
++ movl PTR(PA_PTE_0)(%ebp), %edx
++ orl $PAGE_ATTR, %edx
++ movl %edx, (%eax)
++
++ movl PTR(VA_PTE_0)(%ebp), %edi
++ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
++ andl $0x003ff000, %eax
++ shrl $10, %eax
++ addl %edi, %eax
++
++ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
++ orl $PAGE_ATTR, %edx
++ movl %edx, (%eax)
++
++ /* identity map the control page at its physical address */
++
++ movl PTR(VA_PGD)(%ebp), %edi
++ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
++ andl $0xffc00000, %eax
++ shrl $20, %eax
++ addl %edi, %eax
++
++ movl PTR(PA_PTE_1)(%ebp), %edx
++ orl $PAGE_ATTR, %edx
++ movl %edx, (%eax)
++
++ movl PTR(VA_PTE_1)(%ebp), %edi
++ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
++ andl $0x003ff000, %eax
++ shrl $10, %eax
++ addl %edi, %eax
++
++ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
++ orl $PAGE_ATTR, %edx
++ movl %edx, (%eax)
++#endif
+
+- /*
+- * Must be relocatable PIC code callable as a C function, that once
+- * it starts can not use the previous processes stack.
+- */
+- .globl relocate_new_kernel
+ relocate_new_kernel:
+ /* read the arguments and say goodbye to the stack */
+ movl 4(%esp), %ebx /* page_list */
+- movl 8(%esp), %ebp /* reboot_code_buffer */
++ movl 8(%esp), %ebp /* list of pages */
+ movl 12(%esp), %edx /* start address */
+ movl 16(%esp), %ecx /* cpu_has_pae */
+
+@@ -24,11 +146,26 @@ relocate_new_kernel:
+ pushl $0
+ popfl
+
+- /* set a new stack at the bottom of our page... */
+- lea 4096(%ebp), %esp
++ /* get physical address of control page now */
++ /* this is impossible after page table switch */
++ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
++
++ /* switch to new set of page tables */
++ movl PTR(PA_PGD)(%ebp), %eax
++ movl %eax, %cr3
++
++ /* setup a new stack at the end of the physical control page */
++ lea 4096(%edi), %esp
+
+- /* store the parameters back on the stack */
+- pushl %edx /* store the start address */
++ /* jump to identity mapped page */
++ movl %edi, %eax
++ addl $(identity_mapped - relocate_kernel), %eax
++ pushl %eax
++ ret
++
++identity_mapped:
++ /* store the start address on the stack */
++ pushl %edx
+
+ /* Set cr0 to a known state:
+ * 31 0 == Paging disabled
+@@ -113,8 +250,3 @@ relocate_new_kernel:
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+ ret
+-relocate_new_kernel_end:
+-
+- .globl relocate_new_kernel_size
+-relocate_new_kernel_size:
+- .long relocate_new_kernel_end - relocate_new_kernel
+--- a/include/asm-i386/kexec.h
++++ b/include/asm-i386/kexec.h
+@@ -1,6 +1,26 @@
+ #ifndef _I386_KEXEC_H
+ #define _I386_KEXEC_H
+
++#define PA_CONTROL_PAGE 0
++#define VA_CONTROL_PAGE 1
++#define PA_PGD 2
++#define VA_PGD 3
++#define PA_PTE_0 4
++#define VA_PTE_0 5
++#define PA_PTE_1 6
++#define VA_PTE_1 7
++#ifdef CONFIG_X86_PAE
++#define PA_PMD_0 8
++#define VA_PMD_0 9
++#define PA_PMD_1 10
++#define VA_PMD_1 11
++#define PAGES_NR 12
++#else
++#define PAGES_NR 8
++#endif
++
++#ifndef __ASSEMBLY__
++
+ #include <asm/fixmap.h>
+ #include <asm/ptrace.h>
+ #include <asm/string.h>
+@@ -72,5 +92,12 @@ static inline void crash_setup_regs(stru
+ newregs->eip = (unsigned long)current_text_addr();
+ }
+ }
++asmlinkage NORET_TYPE void
++relocate_kernel(unsigned long indirection_page,
++ unsigned long control_page,
++ unsigned long start_address,
++ unsigned int has_pae) ATTRIB_NORET;
++
++#endif /* __ASSEMBLY__ */
+
+ #endif /* _I386_KEXEC_H */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-move_segment_code-i386.patch 2006-10-16 12:23:55.000000000 +0900
@@ -0,0 +1,169 @@
+kexec: Move asm segment handling code to the assembly file (i386)
+
+This patch moves the idt, gdt, and segment handling code from machine_kexec.c
+to relocate_kernel.S. The main reason behind this move is to avoid code
+duplication in the Xen hypervisor. With this patch all code required to kexec
+is put on the control page.
+
+On top of that this patch also counts as a cleanup - I think it is much
+nicer to write assembly directly in assembly files than wrap inline assembly
+in C functions for no apparent reason.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+---
+
+ Applies to 2.6.19-rc1.
+
+ machine_kexec.c | 59 -----------------------------------------------------
+ relocate_kernel.S | 58 +++++++++++++++++++++++++++++++++++++++++++++++-----
+ 2 files changed, 53 insertions(+), 64 deletions(-)
+
+--- 0002/arch/i386/kernel/machine_kexec.c
++++ work/arch/i386/kernel/machine_kexec.c 2006-10-05 15:49:08.000000000 +0900
+@@ -29,48 +29,6 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
+ static u32 kexec_pte0[1024] PAGE_ALIGNED;
+ static u32 kexec_pte1[1024] PAGE_ALIGNED;
+
+-static void set_idt(void *newidt, __u16 limit)
+-{
+- struct Xgt_desc_struct curidt;
+-
+- /* ia32 supports unaliged loads & stores */
+- curidt.size = limit;
+- curidt.address = (unsigned long)newidt;
+-
+- load_idt(&curidt);
+-};
+-
+-
+-static void set_gdt(void *newgdt, __u16 limit)
+-{
+- struct Xgt_desc_struct curgdt;
+-
+- /* ia32 supports unaligned loads & stores */
+- curgdt.size = limit;
+- curgdt.address = (unsigned long)newgdt;
+-
+- load_gdt(&curgdt);
+-};
+-
+-static void load_segments(void)
+-{
+-#define __STR(X) #X
+-#define STR(X) __STR(X)
+-
+- __asm__ __volatile__ (
+- "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+- "\t1:\n"
+- "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+- "\tmovl %%eax,%%ds\n"
+- "\tmovl %%eax,%%es\n"
+- "\tmovl %%eax,%%fs\n"
+- "\tmovl %%eax,%%gs\n"
+- "\tmovl %%eax,%%ss\n"
+- ::: "eax", "memory");
+-#undef STR
+-#undef __STR
+-}
+-
+ /*
+ * A architecture hook called to validate the
+ * proposed image and prepare the control pages
+@@ -127,23 +85,6 @@ NORET_TYPE void machine_kexec(struct kim
+ page_list[PA_PTE_1] = __pa(kexec_pte1);
+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+- /* The segment registers are funny things, they have both a
+- * visible and an invisible part. Whenever the visible part is
+- * set to a specific selector, the invisible part is loaded
+- * with from a table in memory. At no other time is the
+- * descriptor table in memory accessed.
+- *
+- * I take advantage of this here by force loading the
+- * segments, before I zap the gdt with an invalid value.
+- */
+- load_segments();
+- /* The gdt & idt are now invalid.
+- * If you want to load them you must set up your own idt & gdt.
+- */
+- set_gdt(phys_to_virt(0),0);
+- set_idt(phys_to_virt(0),0);
+-
+- /* now call it */
+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+ image->start, cpu_has_pae);
+ }
+--- 0002/arch/i386/kernel/relocate_kernel.S
++++ work/arch/i386/kernel/relocate_kernel.S 2006-10-05 16:03:21.000000000 +0900
+@@ -154,14 +154,45 @@ relocate_new_kernel:
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, %cr3
+
++ /* setup idt */
++ movl %edi, %eax
++ addl $(idt_48 - relocate_kernel), %eax
++ lidtl (%eax)
++
++ /* setup gdt */
++ movl %edi, %eax
++ addl $(gdt - relocate_kernel), %eax
++ movl %edi, %esi
++ addl $((gdt_48 - relocate_kernel) + 2), %esi
++ movl %eax, (%esi)
++
++ movl %edi, %eax
++ addl $(gdt_48 - relocate_kernel), %eax
++ lgdtl (%eax)
++
++ /* setup data segment registers */
++ mov $(gdt_ds - gdt), %eax
++ mov %eax, %ds
++ mov %eax, %es
++ mov %eax, %fs
++ mov %eax, %gs
++ mov %eax, %ss
++
+ /* setup a new stack at the end of the physical control page */
+ lea 4096(%edi), %esp
+
+- /* jump to identity mapped page */
+- movl %edi, %eax
+- addl $(identity_mapped - relocate_kernel), %eax
+- pushl %eax
+- ret
++ /* load new code segment and jump to identity mapped page */
++ movl %edi, %esi
++ xorl %eax, %eax
++ pushl %eax
++ pushl %esi
++ pushl %eax
++ movl $(gdt_cs - gdt), %eax
++ pushl %eax
++ movl %edi, %eax
++ addl $(identity_mapped - relocate_kernel),%eax
++ pushl %eax
++ iretl
+
+ identity_mapped:
+ /* store the start address on the stack */
+@@ -250,3 +281,20 @@ identity_mapped:
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+ ret
++
++ .align 16
++gdt:
++ .quad 0x0000000000000000 /* NULL descriptor */
++gdt_cs:
++ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
++gdt_ds:
++ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
++gdt_end:
++
++gdt_48:
++ .word gdt_end - gdt - 1 /* limit */
++ .long 0 /* base - filled in by code above */
++
++idt_48:
++ .word 0 /* limit */
++ .long 0 /* base */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-i386.patch 2006-10-16 12:23:55.000000000 +0900
@@ -0,0 +1,54 @@
+--- 0004/arch/i386/kernel/machine_kexec.c
++++ work/arch/i386/kernel/machine_kexec.c 2006-10-11 18:34:06.000000000 +0900
+@@ -20,6 +20,10 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
+ #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+ static u32 kexec_pgd[1024] PAGE_ALIGNED;
+ #ifdef CONFIG_X86_PAE
+@@ -29,6 +33,40 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
+ static u32 kexec_pte0[1024] PAGE_ALIGNED;
+ static u32 kexec_pte1[1024] PAGE_ALIGNED;
+
++#ifdef CONFIG_XEN
++
++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
++
++#if PAGES_NR > KEXEC_XEN_NO_PAGES
++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
++#endif
++
++#if PA_CONTROL_PAGE != 0
++#error PA_CONTROL_PAGE is non zero - Xen support will break
++#endif
++
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++ void *control_page;
++
++ memset(xki->page_list, 0, sizeof(xki->page_list));
++
++ control_page = page_address(image->control_code_page);
++ memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
++ xki->page_list[PA_PGD] = __ma(kexec_pgd);
++#ifdef CONFIG_X86_PAE
++ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
++ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
++#endif
++ xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
++ xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
++
++}
++
++#endif /* CONFIG_XEN */
++
+ /*
+ * A architecture hook called to validate the
+ * proposed image and prepare the control pages
--- 0004/patches/linux-2.6.16.29/series
+++ work/patches/linux-2.6.16.29/series 2006-10-16 12:23:54.000000000 +0900
@@ -1,6 +1,9 @@
kexec-generic.patch
git-2efe55a9cec8418f0e0cde3dc3787a42fddc4411.patch
git-2a8a3d5b65e86ec1dfef7d268c64a909eab94af7.patch
+git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch
+linux-2.6.19-rc1-kexec-move_segment_code-i386.patch
+linux-2.6.19-rc1-kexec-xen-i386.patch
blktap-aio-16_03_06.patch
device_bind.patch
fix-hz-suspend.patch
--- 0004/xen/arch/x86/crash.c
+++ work/xen/arch/x86/crash.c 2006-10-16 12:23:54.000000000 +0900
@@ -21,6 +21,7 @@
#include <xen/delay.h>
#include <xen/perfc.h>
#include <xen/kexec.h>
+#include <xen/sched.h>
#include <public/xen.h>
#include <asm/hvm/hvm.h>
@@ -171,6 +172,51 @@ static void nmi_shootdown_cpus(void)
}
#endif
+/* The cr3 for dom0 on each of its vcpus
+ * It is added as ELF_Prstatus prstatus.pr_reg[ELF_NGREG-1], where
+ * prstatus is the data of the elf note, and ELF_NGREG was extended
+ * by one to allow extra space.
+ * This code runs after all cpus except the crashing one have
+ * been shutdown so as to avoid having to hold domlist_lock,
+ * as locking after a crash is playing with fire */
+void find_dom0_cr3(void)
+{
+ struct domain *d;
+ struct vcpu *v;
+ uint32_t *buf;
+ uint32_t cr3;
+ Elf_Note note;
+
+ /* Don't need to grab domlist_lock as we are the only thing running */
+
+ /* No need to traverse domain_list, as dom0 is always first */
+ d = domain_list;
+ BUG_ON(d->domain_id);
+
+ for_each_vcpu ( d, v ) {
+ if ( test_bit(_VCPUF_down, &v->vcpu_flags) )
+ continue;
+ buf = (uint32_t *)per_cpu(crash_notes, v->processor);
+ if (!buf) /* XXX: Can this ever occur? */
+ continue;
+
+ memcpy(&note, buf, sizeof(Elf_Note));
+ buf += (sizeof(Elf_Note) +3)/4 + (note.namesz + 3)/4 +
+ (note.descsz + 3)/4;
+
+ /* XXX: This probably doesn't take into account shadow mode,
+ * but that might not be a problem */
+ cr3 = pagetable_get_pfn(v->arch.guest_table);
+
+ buf = append_elf_note(buf, "Xen Domanin-0 CR3",
+ NT_XEN_DOM0_CR3, &cr3, 4);
+ final_note(buf);
+
+ printk("domain:%i vcpu:%u processor:%u cr3:%08x\n",
+ d->domain_id, v->vcpu_id, v->processor, cr3);
+ }
+}
+
void machine_crash_shutdown(struct cpu_user_regs *regs)
{
printk("machine_crash_shutdown: %d\n", smp_processor_id());
@@ -185,6 +231,7 @@ void machine_crash_shutdown(struct cpu_u
hvm_disable();
crash_save_self(regs);
+ find_dom0_cr3();
}
/*
--- 0001/xen/arch/x86/x86_32/entry.S
+++ work/xen/arch/x86/x86_32/entry.S 2006-10-16 12:23:54.000000000 +0900
@@ -672,6 +672,7 @@ ENTRY(hypercall_table)
.long do_hvm_op
.long do_sysctl /* 35 */
.long do_domctl
+ .long do_kexec_op
.rept NR_hypercalls-((.-hypercall_table)/4)
.long do_ni_hypercall
.endr
@@ -714,6 +715,7 @@ ENTRY(hypercall_args_table)
.byte 2 /* do_hvm_op */
.byte 1 /* do_sysctl */ /* 35 */
.byte 1 /* do_domctl */
+ .byte 1 /* do_kexec_op */
.rept NR_hypercalls-(.-hypercall_args_table)
.byte 0 /* do_ni_hypercall */
.endr
--- 0004/xen/arch/x86/x86_32/machine_kexec.c
+++ work/xen/arch/x86/x86_32/machine_kexec.c 2006-10-16 12:23:55.000000000 +0900
@@ -1,18 +1,29 @@
-/*
+/******************************************************************************
* arch/x86/x86_32/machine_kexec.c
- * Handle transition of Linux booting another kernel
- *
- * Created By: Horms <horms@xxxxxxxxxxxx>
+ *
+ * Created By: Horms
*
- * Should be losely based on arch/i386/kernel/machine_kexec.c
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
*/
-#include <xen/lib.h> /* for printk() used in stub */
+#include <xen/types.h>
#include <public/kexec.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+ unsigned long indirection_page,
+ unsigned long control_page,
+ unsigned long start_address,
+ unsigned int has_pae);
void machine_kexec(xen_kexec_image_t *image)
{
- printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+ relocate_new_kernel_t rnk;
+
+ rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_BASE_0);
+ (*rnk)(image->indirection_page, (unsigned long)image->page_list,
+ image->start_address, (unsigned long)cpu_has_pae);
}
/*
--- 0004/xen/include/asm-x86/x86_32/elf.h
+++ work/xen/include/asm-x86/x86_32/elf.h 2006-10-16 12:23:55.000000000 +0900
@@ -3,17 +3,39 @@
*
* Created By: Horms
*
- * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based heavily on include/asm-i386/elf.h and
+ * include/asm-i386/system.h from Linux 2.6.16
*/
#ifndef __X86_ELF_X86_32_H__
#define __X86_ELF_X86_32_H__
-#include <xen/lib.h> /* for printk() used in stub */
+/* XXX: Xen doesn't have orig_eax. For kdump, on a dom0 crash, the values
+ * for the crashing CPU could be passed down from dom0, but is that
+ * necessary?
+ * Also, I'm not sure why fs and gs are derived from the CPU
+ * rather than regs */
-#define ELF_CORE_COPY_REGS(pr_reg, regs) \
- printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \
+ unsigned i; \
+ pr_reg[0] = regs->ebx; \
+ pr_reg[1] = regs->ecx; \
+ pr_reg[2] = regs->edx; \
+ pr_reg[3] = regs->esi; \
+ pr_reg[4] = regs->edi; \
+ pr_reg[5] = regs->ebp; \
+ pr_reg[6] = regs->eax; \
+ pr_reg[7] = regs->ds; \
+ pr_reg[8] = regs->es; \
+ asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9] = i; \
+ asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i; \
+ pr_reg[11] = 0; /* regs->orig_eax; */ \
+ pr_reg[12] = regs->eip; \
+ pr_reg[13] = regs->cs; \
+ pr_reg[14] = regs->eflags; \
+ pr_reg[15] = regs->esp; \
+ pr_reg[16] = regs->ss; \
+} while(0);
#endif /* __X86_ELF_X86_32_H__ */
--- 0004/xen/include/asm-x86/x86_32/kexec.h
+++ work/xen/include/asm-x86/x86_32/kexec.h 2006-10-16 12:23:55.000000000 +0900
@@ -3,39 +3,72 @@
*
* Created By: Horms
*
- * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
- *
+ * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16
*/
-#ifndef __X86_32_KEXEC_H__
-#define __X86_32_KEXEC_H__
-
-#include <xen/lib.h> /* for printk() used in stub */
-#include <xen/types.h>
-#include <public/xen.h>
+#ifndef __X86_KEXEC_X86_32_H__
+#define __X86_KEXEC_X86_32_H__
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
- struct cpu_user_regs *oldregs)
+ struct cpu_user_regs *oldregs)
{
- printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
- return;
- crash_fixup_ss_esp(newregs, oldregs);
+ memcpy(newregs, oldregs, sizeof(*newregs));
+ newregs->esp = (unsigned long)&(oldregs->esp);
+ __asm__ __volatile__(
+ "xorl %%eax, %%eax\n\t"
+ "movw %%ss, %%ax\n\t"
+ :"=a"(newregs->ss));
}
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and esp if coming via kernel
+ * mode exception.
+ */
static void crash_setup_regs(struct cpu_user_regs *newregs,
struct cpu_user_regs *oldregs)
{
- printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+ if (oldregs)
+ crash_fixup_ss_esp(newregs, oldregs);
+ else {
+ __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+ __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+ __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+ __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+ __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+ __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+ __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+ __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+ __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss));
+ __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs));
+ __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds));
+ __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es));
+ __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+
+ newregs->eip = (unsigned long)current_text_addr();
+ }
}
+/*
+ * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h
+ *
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value. This tricky test checks that with
+ * one comparison. Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
static inline int user_mode(struct cpu_user_regs *regs)
{
- printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
- return -1;
+ return (regs->cs & 2) != 0;
}
-#endif /* __X86_32_KEXEC_H__ */
+#endif /* __X86_KEXEC_X86_32_H__ */
/*
* Local variables:
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|