[PATCH 04/04] Kexec / Kdump: x86_64 specific code
This patch contains the x86_64 implementation of Kexec / Kdump for Xen.
Signed-Off-By: Magnus Damm <magnus@xxxxxxxxxxxxx>
---
Applies on top of xen-unstable-11856.
buildconfigs/linux-defconfig_xen_x86_64 | 1
linux-2.6-xen-sparse/arch/x86_64/Kconfig | 2
linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile | 2
linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c | 27
linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h | 64 +
linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h | 7
linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h | 2
patches/linux-2.6.16.29/git-4b...1f.patch | 375 ++++
patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec..code-x86_64.patch | 161 ++
patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-x86_64.patch | 162 ++
patches/linux-2.6.16.29/series | 3
xen/arch/x86/x86_64/entry.S | 2
xen/include/asm-x86/x86_64/elf.h | 49 +
xen/include/asm-x86/x86_64/kexec.h | 60 +
14 files changed, 903 insertions(+), 14 deletions(-)
--- 0002/buildconfigs/linux-defconfig_xen_x86_64
+++ work/buildconfigs/linux-defconfig_xen_x86_64 2006-10-23
11:36:17.000000000 +0900
@@ -138,6 +138,7 @@ CONFIG_SWIOTLB=y
CONFIG_PHYSICAL_START=0x100000
CONFIG_SECCOMP=y
CONFIG_HZ_100=y
+CONFIG_KEXEC=y
# CONFIG_HZ_250 is not set
# CONFIG_HZ_1000 is not set
CONFIG_HZ=100
--- 0001/linux-2.6-xen-sparse/arch/x86_64/Kconfig
+++ work/linux-2.6-xen-sparse/arch/x86_64/Kconfig 2006-10-23
11:36:17.000000000 +0900
@@ -435,7 +435,7 @@ config X86_MCE_AMD
config KEXEC
bool "kexec system call (EXPERIMENTAL)"
- depends on EXPERIMENTAL && !X86_64_XEN
+ depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
--- 0001/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
+++ work/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile 2006-10-23
11:36:17.000000000 +0900
@@ -59,7 +59,7 @@ pci-dma-y += ../../i386/kernel/pci-dma
microcode-$(subst m,y,$(CONFIG_MICROCODE)) := ../../i386/kernel/microcode-xen.o
quirks-y := ../../i386/kernel/quirks-xen.o
-n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
+n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o crash.o
include $(srctree)/scripts/Makefile.xen
--- 0001/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
+++ work/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c 2006-10-23
11:36:17.000000000 +0900
@@ -80,6 +80,10 @@
#include <asm/mach-xen/setup_arch_post.h>
#include <xen/interface/memory.h>
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
extern unsigned long start_pfn;
extern struct edid_info edid_info;
@@ -450,6 +454,7 @@ static __init void parse_cmdline_early (
* after a kernel panic.
*/
else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
unsigned long size, base;
size = memparse(from+12, &from);
if (*from == '@') {
@@ -460,6 +465,10 @@ static __init void parse_cmdline_early (
crashk_res.start = base;
crashk_res.end = base + size - 1;
}
+#else
+ printk("Ignoring crashkernel command line, "
+ "parameter will be supplied by xen\n");
+#endif
}
#endif
@@ -812,10 +821,23 @@ void __init setup_arch(char **cmdline_p)
#endif
#endif /* !CONFIG_XEN */
#ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
if (crashk_res.start != crashk_res.end) {
reserve_bootmem(crashk_res.start,
crashk_res.end - crashk_res.start + 1);
}
+#else
+ {
+ xen_kexec_reserve_t reservation;
+ BUG_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_reserve,
+ &reservation));
+ if (reservation.size) {
+ crashk_res.start = reservation.start;
+ crashk_res.end = reservation.start +
+ reservation.size - 1;
+ }
+ }
+#endif
#endif
paging_init();
@@ -954,6 +976,11 @@ void __init setup_arch(char **cmdline_p)
iommu_hole_init();
#endif
+#ifdef CONFIG_KEXEC
+	/* The crash kernel area is memory, so claim it in the iomem tree. */
+	if (crashk_res.start != crashk_res.end)
+		request_resource(&iomem_resource, &crashk_res);
+#endif
#ifdef CONFIG_XEN
{
struct physdev_set_iopl set_iopl;
--- /dev/null
+++ work/linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h 2006-10-23
11:36:18.000000000 +0900
@@ -0,0 +1,64 @@
+/*
+ * include/asm-x86_64/kexec-xen.h
+ *
+ * Created By: Horms <horms@xxxxxxxxxxxx>
+ */
+
+#ifndef _X86_64_KEXEC_XEN_H
+#define _X86_64_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_64.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+ struct cpu_user_regs *xen_regs)
+{
+ xen_regs->r15 = linux_regs->r15;
+ xen_regs->r14 = linux_regs->r14;
+ xen_regs->r13 = linux_regs->r13;
+ xen_regs->r12 = linux_regs->r12;
+ xen_regs->rbp = linux_regs->rbp;
+ xen_regs->rbx = linux_regs->rbx;
+ xen_regs->r11 = linux_regs->r11;
+ xen_regs->r10 = linux_regs->r10;
+ xen_regs->r9 = linux_regs->r9;
+ xen_regs->r8 = linux_regs->r8;
+ xen_regs->rax = linux_regs->rax;
+ xen_regs->rcx = linux_regs->rcx;
+ xen_regs->rdx = linux_regs->rdx;
+ xen_regs->rsi = linux_regs->rsi;
+ xen_regs->rdi = linux_regs->rdi;
+ xen_regs->rip = linux_regs->rip;
+ xen_regs->cs = linux_regs->cs;
+ xen_regs->rflags = linux_regs->eflags;
+ xen_regs->rsp = linux_regs->rsp;
+ xen_regs->ss = linux_regs->ss;
+}
+
+/* Kexec needs to know about the actual physical (machine) address.
+ * But in xen, on some architectures, a physical address is a
+ * pseudo-physical address, so the Xen variants convert pfn <-> mfn. */
+#ifdef CONFIG_XEN
+#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#else
+#define kexec_page_to_pfn(page) page_to_pfn(page)
+#define kexec_pfn_to_page(pfn) pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
+#endif /* _X86_64_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ * c-file-style: "linux"
+ * indent-tabs-mode: t
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * tab-width: 8
+ * End:
+ */
--- 0001/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
+++ work/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
2006-10-23 11:36:17.000000000 +0900
@@ -386,4 +386,11 @@ HYPERVISOR_xenoprof_op(
return _hypercall2(int, xenoprof_op, op, arg);
}
+static inline int
+HYPERVISOR_kexec_op(
+ unsigned long op, void *args)
+{
+ return _hypercall2(int, kexec_op, op, args);
+}
+
#endif /* __HYPERCALL_H__ */
--- 0001/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
+++ work/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
2006-10-23 11:36:17.000000000 +0900
@@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct p
#define profile_pc(regs) instruction_pointer(regs)
#endif
+#include <linux/compiler.h>
+
void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
struct task_struct;
--- /dev/null
+++
work/patches/linux-2.6.16.29/git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch
2006-10-23 11:36:18.000000000 +0900
@@ -0,0 +1,375 @@
+From: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Date: Tue, 26 Sep 2006 08:52:38 +0000 (+0200)
+Subject: [PATCH] Avoid overwriting the current pgd (V4, x86_64)
+X-Git-Tag: v2.6.19-rc1
+X-Git-Url: http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f
+
+[PATCH] Avoid overwriting the current pgd (V4, x86_64)
+
+kexec: Avoid overwriting the current pgd (V4, x86_64)
+
+This patch upgrades the x86_64-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables. These tables are used to provide
+an executable identity mapping without overwriting the current pgd.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Signed-off-by: Andi Kleen <ak@xxxxxxx>
+---
+
+--- a/arch/x86_64/kernel/machine_kexec.c
++++ b/arch/x86_64/kernel/machine_kexec.c
+@@ -15,6 +15,15 @@
+ #include <asm/mmu_context.h>
+ #include <asm/io.h>
+
++#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
++static u64 kexec_pgd[512] PAGE_ALIGNED;
++static u64 kexec_pud0[512] PAGE_ALIGNED;
++static u64 kexec_pmd0[512] PAGE_ALIGNED;
++static u64 kexec_pte0[512] PAGE_ALIGNED;
++static u64 kexec_pud1[512] PAGE_ALIGNED;
++static u64 kexec_pmd1[512] PAGE_ALIGNED;
++static u64 kexec_pte1[512] PAGE_ALIGNED;
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+ unsigned long end_addr;
+@@ -144,32 +153,19 @@ static void load_segments(void)
+ );
+ }
+
+-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+- unsigned long control_code_buffer,
+- unsigned long start_address,
+- unsigned long pgtable) ATTRIB_NORET;
+-
+-extern const unsigned char relocate_new_kernel[];
+-extern const unsigned long relocate_new_kernel_size;
+-
+ int machine_kexec_prepare(struct kimage *image)
+ {
+- unsigned long start_pgtable, control_code_buffer;
++ unsigned long start_pgtable;
+ int result;
+
+ /* Calculate the offsets */
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+- control_code_buffer = start_pgtable + PAGE_SIZE;
+
+ /* Setup the identity mapped 64bit page table */
+ result = init_pgtable(image, start_pgtable);
+ if (result)
+ return result;
+
+- /* Place the code in the reboot code buffer */
+- memcpy(__va(control_code_buffer), relocate_new_kernel,
+- relocate_new_kernel_size);
+-
+ return 0;
+ }
+
+@@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage
+ */
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+- unsigned long page_list;
+- unsigned long control_code_buffer;
+- unsigned long start_pgtable;
+- relocate_new_kernel_t rnk;
++ unsigned long page_list[PAGES_NR];
++ void *control_page;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+- /* Calculate the offsets */
+- page_list = image->head;
+- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+- control_code_buffer = start_pgtable + PAGE_SIZE;
++ control_page = page_address(image->control_code_page) + PAGE_SIZE;
++ memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+- /* Set the low half of the page table to my identity mapped
+- * page table for kexec. Leave the high half pointing at the
+- * kernel pages. Don't bother to flush the global pages
+- * as that will happen when I fully switch to my identity mapped
+- * page table anyway.
+- */
+- memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+- __flush_tlb();
++ page_list[PA_CONTROL_PAGE] = __pa(control_page);
++ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
++ page_list[PA_PGD] = __pa(kexec_pgd);
++ page_list[VA_PGD] = (unsigned long)kexec_pgd;
++ page_list[PA_PUD_0] = __pa(kexec_pud0);
++ page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
++ page_list[PA_PMD_0] = __pa(kexec_pmd0);
++ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
++ page_list[PA_PTE_0] = __pa(kexec_pte0);
++ page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
++ page_list[PA_PUD_1] = __pa(kexec_pud1);
++ page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
++ page_list[PA_PMD_1] = __pa(kexec_pmd1);
++ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
++ page_list[PA_PTE_1] = __pa(kexec_pte1);
++ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
++ page_list[PA_TABLE_PAGE] =
++ (unsigned long)__pa(page_address(image->control_code_page));
+
+ /* The segment registers are funny things, they have both a
+ * visible and an invisible part. Whenever the visible part is
+@@ -222,9 +224,10 @@ NORET_TYPE void machine_kexec(struct kim
+ */
+ set_gdt(phys_to_virt(0),0);
+ set_idt(phys_to_virt(0),0);
++
+ /* now call it */
+- rnk = (relocate_new_kernel_t) control_code_buffer;
+- (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
++ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
++ image->start);
+ }
+
+ /* crashkernel=size@addr specifies the location to reserve for
+--- a/arch/x86_64/kernel/relocate_kernel.S
++++ b/arch/x86_64/kernel/relocate_kernel.S
+@@ -7,31 +7,169 @@
+ */
+
+ #include <linux/linkage.h>
++#include <asm/page.h>
++#include <asm/kexec.h>
+
+- /*
+- * Must be relocatable PIC code callable as a C function, that once
+- * it starts can not use the previous processes stack.
+- */
+- .globl relocate_new_kernel
++/*
++ * Must be relocatable PIC code callable as a C function
++ */
++
++#define PTR(x) (x << 3)
++#define PAGE_ALIGNED (1 << PAGE_SHIFT)
++#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
++
++ .text
++ .align PAGE_ALIGNED
+ .code64
++ .globl relocate_kernel
++relocate_kernel:
++ /* %rdi indirection_page
++ * %rsi page_list
++ * %rdx start address
++ */
++
++ /* map the control page at its virtual address */
++
++ movq $0x0000ff8000000000, %r10 /* mask */
++ mov $(39 - 3), %cl /* bits to shift */
++ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
++
++ movq %r11, %r9
++ andq %r10, %r9
++ shrq %cl, %r9
++
++ movq PTR(VA_PGD)(%rsi), %r8
++ addq %r8, %r9
++ movq PTR(PA_PUD_0)(%rsi), %r8
++ orq $PAGE_ATTR, %r8
++ movq %r8, (%r9)
++
++ shrq $9, %r10
++ sub $9, %cl
++
++ movq %r11, %r9
++ andq %r10, %r9
++ shrq %cl, %r9
++
++ movq PTR(VA_PUD_0)(%rsi), %r8
++ addq %r8, %r9
++ movq PTR(PA_PMD_0)(%rsi), %r8
++ orq $PAGE_ATTR, %r8
++ movq %r8, (%r9)
++
++ shrq $9, %r10
++ sub $9, %cl
++
++ movq %r11, %r9
++ andq %r10, %r9
++ shrq %cl, %r9
++
++ movq PTR(VA_PMD_0)(%rsi), %r8
++ addq %r8, %r9
++ movq PTR(PA_PTE_0)(%rsi), %r8
++ orq $PAGE_ATTR, %r8
++ movq %r8, (%r9)
++
++ shrq $9, %r10
++ sub $9, %cl
++
++ movq %r11, %r9
++ andq %r10, %r9
++ shrq %cl, %r9
++
++ movq PTR(VA_PTE_0)(%rsi), %r8
++ addq %r8, %r9
++ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
++ orq $PAGE_ATTR, %r8
++ movq %r8, (%r9)
++
++ /* identity map the control page at its physical address */
++
++ movq $0x0000ff8000000000, %r10 /* mask */
++ mov $(39 - 3), %cl /* bits to shift */
++ movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
++
++ movq %r11, %r9
++ andq %r10, %r9
++ shrq %cl, %r9
++
++ movq PTR(VA_PGD)(%rsi), %r8
++ addq %r8, %r9
++ movq PTR(PA_PUD_1)(%rsi), %r8
++ orq $PAGE_ATTR, %r8
++ movq %r8, (%r9)
++
++ shrq $9, %r10
++ sub $9, %cl
++
++ movq %r11, %r9
++ andq %r10, %r9
++ shrq %cl, %r9
++
++ movq PTR(VA_PUD_1)(%rsi), %r8
++ addq %r8, %r9
++ movq PTR(PA_PMD_1)(%rsi), %r8
++ orq $PAGE_ATTR, %r8
++ movq %r8, (%r9)
++
++ shrq $9, %r10
++ sub $9, %cl
++
++ movq %r11, %r9
++ andq %r10, %r9
++ shrq %cl, %r9
++
++ movq PTR(VA_PMD_1)(%rsi), %r8
++ addq %r8, %r9
++ movq PTR(PA_PTE_1)(%rsi), %r8
++ orq $PAGE_ATTR, %r8
++ movq %r8, (%r9)
++
++ shrq $9, %r10
++ sub $9, %cl
++
++ movq %r11, %r9
++ andq %r10, %r9
++ shrq %cl, %r9
++
++ movq PTR(VA_PTE_1)(%rsi), %r8
++ addq %r8, %r9
++ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
++ orq $PAGE_ATTR, %r8
++ movq %r8, (%r9)
++
+ relocate_new_kernel:
+- /* %rdi page_list
+- * %rsi reboot_code_buffer
++ /* %rdi indirection_page
++ * %rsi page_list
+ * %rdx start address
+- * %rcx page_table
+- * %r8 arg5
+- * %r9 arg6
+ */
+
+ /* zero out flags, and disable interrupts */
+ pushq $0
+ popfq
+
+- /* set a new stack at the bottom of our page... */
+- lea 4096(%rsi), %rsp
++ /* get physical address of control page now */
++ /* this is impossible after page table switch */
++ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
++
++ /* get physical address of page table now too */
++ movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
++
++ /* switch to new set of page tables */
++ movq PTR(PA_PGD)(%rsi), %r9
++ movq %r9, %cr3
++
++ /* setup a new stack at the end of the physical control page */
++ lea 4096(%r8), %rsp
++
++ /* jump to identity mapped page */
++ addq $(identity_mapped - relocate_kernel), %r8
++ pushq %r8
++ ret
+
+- /* store the parameters back on the stack */
+- pushq %rdx /* store the start address */
++identity_mapped:
++ /* store the start address on the stack */
++ pushq %rdx
+
+ /* Set cr0 to a known state:
+ * 31 1 == Paging enabled
+@@ -136,8 +274,3 @@ relocate_new_kernel:
+ xorq %r15, %r15
+
+ ret
+-relocate_new_kernel_end:
+-
+- .globl relocate_new_kernel_size
+-relocate_new_kernel_size:
+- .quad relocate_new_kernel_end - relocate_new_kernel
+--- a/include/asm-x86_64/kexec.h
++++ b/include/asm-x86_64/kexec.h
+@@ -1,6 +1,27 @@
+ #ifndef _X86_64_KEXEC_H
+ #define _X86_64_KEXEC_H
+
++#define PA_CONTROL_PAGE 0
++#define VA_CONTROL_PAGE 1
++#define PA_PGD 2
++#define VA_PGD 3
++#define PA_PUD_0 4
++#define VA_PUD_0 5
++#define PA_PMD_0 6
++#define VA_PMD_0 7
++#define PA_PTE_0 8
++#define VA_PTE_0 9
++#define PA_PUD_1 10
++#define VA_PUD_1 11
++#define PA_PMD_1 12
++#define VA_PMD_1 13
++#define PA_PTE_1 14
++#define VA_PTE_1 15
++#define PA_TABLE_PAGE 16
++#define PAGES_NR 17
++
++#ifndef __ASSEMBLY__
++
+ #include <linux/string.h>
+
+ #include <asm/page.h>
+@@ -64,4 +85,12 @@ static inline void crash_setup_regs(stru
+ newregs->rip = (unsigned long)current_text_addr();
+ }
+ }
++
++NORET_TYPE void
++relocate_kernel(unsigned long indirection_page,
++ unsigned long page_list,
++ unsigned long start_address) ATTRIB_NORET;
++
++#endif /* __ASSEMBLY__ */
++
+ #endif /* _X86_64_KEXEC_H */
--- /dev/null
+++
work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch
2006-10-23 11:36:18.000000000 +0900
@@ -0,0 +1,161 @@
+kexec: Move asm segment handling code to the assembly file (x86_64)
+
+This patch moves the idt, gdt, and segment handling code from machine_kexec.c
+to relocate_kernel.S. The main reason behind this move is to avoid code
+duplication in the Xen hypervisor. With this patch all code required to kexec
+is put on the control page.
+
+On top of that this patch also counts as a cleanup - I think it is much
+nicer to write assembly directly in assembly files than wrap inline assembly
+in C functions for no apparent reason.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+---
+
+ Applies to 2.6.19-rc1.
+
+ machine_kexec.c | 58 -----------------------------------------------------
+ relocate_kernel.S | 50 +++++++++++++++++++++++++++++++++++++++++----
+ 2 files changed, 45 insertions(+), 63 deletions(-)
+
+--- 0002/arch/x86_64/kernel/machine_kexec.c
++++ work/arch/x86_64/kernel/machine_kexec.c 2006-10-05 16:15:49.000000000 +0900
+@@ -112,47 +112,6 @@ static int init_pgtable(struct kimage *i
+ return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+ }
+
+-static void set_idt(void *newidt, u16 limit)
+-{
+- struct desc_ptr curidt;
+-
+- /* x86-64 supports unaliged loads & stores */
+- curidt.size = limit;
+- curidt.address = (unsigned long)newidt;
+-
+- __asm__ __volatile__ (
+- "lidtq %0\n"
+- : : "m" (curidt)
+- );
+-};
+-
+-
+-static void set_gdt(void *newgdt, u16 limit)
+-{
+- struct desc_ptr curgdt;
+-
+- /* x86-64 supports unaligned loads & stores */
+- curgdt.size = limit;
+- curgdt.address = (unsigned long)newgdt;
+-
+- __asm__ __volatile__ (
+- "lgdtq %0\n"
+- : : "m" (curgdt)
+- );
+-};
+-
+-static void load_segments(void)
+-{
+- __asm__ __volatile__ (
+- "\tmovl %0,%%ds\n"
+- "\tmovl %0,%%es\n"
+- "\tmovl %0,%%ss\n"
+- "\tmovl %0,%%fs\n"
+- "\tmovl %0,%%gs\n"
+- : : "a" (__KERNEL_DS) : "memory"
+- );
+-}
+-
+ int machine_kexec_prepare(struct kimage *image)
+ {
+ unsigned long start_pgtable;
+@@ -209,23 +168,6 @@ NORET_TYPE void machine_kexec(struct kim
+ page_list[PA_TABLE_PAGE] =
+ (unsigned long)__pa(page_address(image->control_code_page));
+
+- /* The segment registers are funny things, they have both a
+- * visible and an invisible part. Whenever the visible part is
+- * set to a specific selector, the invisible part is loaded
+- * with from a table in memory. At no other time is the
+- * descriptor table in memory accessed.
+- *
+- * I take advantage of this here by force loading the
+- * segments, before I zap the gdt with an invalid value.
+- */
+- load_segments();
+- /* The gdt & idt are now invalid.
+- * If you want to load them you must set up your own idt & gdt.
+- */
+- set_gdt(phys_to_virt(0),0);
+- set_idt(phys_to_virt(0),0);
+-
+- /* now call it */
+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+ image->start);
+ }
+--- 0002/arch/x86_64/kernel/relocate_kernel.S
++++ work/arch/x86_64/kernel/relocate_kernel.S 2006-10-05 16:18:07.000000000 +0900
+@@ -159,13 +159,39 @@ relocate_new_kernel:
+ movq PTR(PA_PGD)(%rsi), %r9
+ movq %r9, %cr3
+
++ /* setup idt */
++ movq %r8, %rax
++ addq $(idt_80 - relocate_kernel), %rax
++ lidtq (%rax)
++
++ /* setup gdt */
++ movq %r8, %rax
++ addq $(gdt - relocate_kernel), %rax
++ movq %r8, %r9
++ addq $((gdt_80 - relocate_kernel) + 2), %r9
++ movq %rax, (%r9)
++
++ movq %r8, %rax
++ addq $(gdt_80 - relocate_kernel), %rax
++ lgdtq (%rax)
++
++ /* setup data segment registers */
++ xorl %eax, %eax
++ movl %eax, %ds
++ movl %eax, %es
++ movl %eax, %fs
++ movl %eax, %gs
++ movl %eax, %ss
++
+ /* setup a new stack at the end of the physical control page */
+ lea 4096(%r8), %rsp
+
+- /* jump to identity mapped page */
+- addq $(identity_mapped - relocate_kernel), %r8
+- pushq %r8
+- ret
++ /* load new code segment and jump to identity mapped page */
++ movq %r8, %rax
++ addq $(identity_mapped - relocate_kernel), %rax
++ pushq $(gdt_cs - gdt)
++ pushq %rax
++ lretq
+
+ identity_mapped:
+ /* store the start address on the stack */
+@@ -272,5 +298,19 @@ identity_mapped:
+ xorq %r13, %r13
+ xorq %r14, %r14
+ xorq %r15, %r15
+-
+ ret
++
++ .align 16
++gdt:
++ .quad 0x0000000000000000 /* NULL descriptor */
++gdt_cs:
++ .quad 0x00af9a000000ffff
++gdt_end:
++
++gdt_80:
++ .word gdt_end - gdt - 1 /* limit */
++ .quad 0 /* base - filled in by code above */
++
++idt_80:
++ .word 0 /* limit */
++ .quad 0 /* base */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-x86_64.patch
2006-10-23 11:36:18.000000000 +0900
@@ -0,0 +1,162 @@
+--- 0006/arch/x86_64/kernel/machine_kexec.c
++++ work/arch/x86_64/kernel/machine_kexec.c 2006-10-06 15:36:16.000000000 +0900
+@@ -24,6 +24,104 @@ static u64 kexec_pud1[512] PAGE_ALIGNED;
+ static u64 kexec_pmd1[512] PAGE_ALIGNED;
+ static u64 kexec_pte1[512] PAGE_ALIGNED;
+
++#ifdef CONFIG_XEN
++
++/* In the case of Xen, override hypervisor functions to be able to create
++ * a regular identity mapping page table...
++ */
++
++#include <xen/interface/kexec.h>
++#include <xen/interface/memory.h>
++
++#define x__pmd(x) ((pmd_t) { (x) } )
++#define x__pud(x) ((pud_t) { (x) } )
++#define x__pgd(x) ((pgd_t) { (x) } )
++
++#define x_pmd_val(x) ((x).pmd)
++#define x_pud_val(x) ((x).pud)
++#define x_pgd_val(x) ((x).pgd)
++
++static inline void x_set_pmd(pmd_t *dst, pmd_t val)
++{
++ x_pmd_val(*dst) = x_pmd_val(val);
++}
++
++static inline void x_set_pud(pud_t *dst, pud_t val)
++{
++ x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
++}
++
++static inline void x_pud_clear (pud_t *pud)
++{
++ x_pud_val(*pud) = 0;
++}
++
++static inline void x_set_pgd(pgd_t *dst, pgd_t val)
++{
++ x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
++}
++
++static inline void x_pgd_clear (pgd_t * pgd)
++{
++ x_pgd_val(*pgd) = 0;
++}
++
++/* Page flag sets for the Xen helpers; parenthesised for safe expansion. */
++#define X_KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
++#define X__PAGE_KERNEL_LARGE_EXEC (X_KERNPG_TABLE | _PAGE_PSE)
++
++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
++
++#if PAGES_NR > KEXEC_XEN_NO_PAGES
++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
++#endif
++
++#if PA_CONTROL_PAGE != 0
++#error PA_CONTROL_PAGE is non zero - Xen support will break
++#endif
++
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++ void *control_page;
++ void *table_page;
++
++ memset(xki->page_list, 0, sizeof(xki->page_list));
++
++ control_page = page_address(image->control_code_page) + PAGE_SIZE;
++ memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++ table_page = page_address(image->control_code_page);
++
++ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
++ xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
++
++ xki->page_list[PA_PGD] = __ma(kexec_pgd);
++ xki->page_list[PA_PUD_0] = __ma(kexec_pud0);
++ xki->page_list[PA_PUD_1] = __ma(kexec_pud1);
++ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
++ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
++ xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
++ xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
++}
++
++#else /* CONFIG_XEN */
++
++#define x__pmd(x) __pmd(x)
++#define x__pud(x) __pud(x)
++#define x__pgd(x) __pgd(x)
++
++#define x_set_pmd(x, y) set_pmd(x, y)
++#define x_set_pud(x, y) set_pud(x, y)
++#define x_set_pgd(x, y) set_pgd(x, y)
++
++#define x_pud_clear(x) pud_clear(x)
++#define x_pgd_clear(x) pgd_clear(x)
++
++#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
++#define X_KERNPG_TABLE _KERNPG_TABLE
++
++#endif /* CONFIG_XEN */
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+ unsigned long end_addr;
+@@ -31,7 +129,7 @@ static void init_level2_page(pmd_t *leve
+ addr &= PAGE_MASK;
+ end_addr = addr + PUD_SIZE;
+ while (addr < end_addr) {
+- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++ x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
+ addr += PMD_SIZE;
+ }
+ }
+@@ -56,12 +154,12 @@ static int init_level3_page(struct kimag
+ }
+ level2p = (pmd_t *)page_address(page);
+ init_level2_page(level2p, addr);
+- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
++ x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
+ addr += PUD_SIZE;
+ }
+ /* clear the unused entries */
+ while (addr < end_addr) {
+- pud_clear(level3p++);
++ x_pud_clear(level3p++);
+ addr += PUD_SIZE;
+ }
+ out:
+@@ -92,12 +190,12 @@ static int init_level4_page(struct kimag
+ if (result) {
+ goto out;
+ }
+- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
++ x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
+ addr += PGDIR_SIZE;
+ }
+ /* clear the unused entries */
+ while (addr < end_addr) {
+- pgd_clear(level4p++);
++ x_pgd_clear(level4p++);
+ addr += PGDIR_SIZE;
+ }
+ out:
+@@ -108,8 +206,14 @@ out:
+ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+ {
+ pgd_t *level4p;
++ unsigned long x_end_pfn = end_pfn;
++
++#ifdef CONFIG_XEN
++ x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++#endif
++
+ level4p = (pgd_t *)__va(start_pgtable);
+- return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
++ return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT);
+ }
+
+ int machine_kexec_prepare(struct kimage *image)
--- 0005/patches/linux-2.6.16.29/series
+++ work/patches/linux-2.6.16.29/series 2006-10-23 11:36:17.000000000 +0900
@@ -4,6 +4,9 @@ git-2a8a3d5b65e86ec1dfef7d268c64a909eab9
git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch
linux-2.6.19-rc1-kexec-move_segment_code-i386.patch
linux-2.6.19-rc1-kexec-xen-i386.patch
+git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch
+linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch
+linux-2.6.19-rc1-kexec-xen-x86_64.patch
blktap-aio-16_03_06.patch
device_bind.patch
fix-hz-suspend.patch
--- 0001/xen/arch/x86/x86_64/entry.S
+++ work/xen/arch/x86/x86_64/entry.S 2006-10-23 11:36:17.000000000 +0900
@@ -573,6 +573,7 @@ ENTRY(hypercall_table)
.quad do_hvm_op
.quad do_sysctl /* 35 */
.quad do_domctl
+ .quad do_kexec_op
.rept NR_hypercalls-((.-hypercall_table)/8)
.quad do_ni_hypercall
.endr
@@ -615,6 +616,7 @@ ENTRY(hypercall_args_table)
.byte 2 /* do_hvm_op */
.byte 1 /* do_sysctl */ /* 35 */
.byte 1 /* do_domctl */
+ .byte 2 /* do_kexec_op */
.rept NR_hypercalls-(.-hypercall_args_table)
.byte 0 /* do_ni_hypercall */
.endr
--- 0004/xen/include/asm-x86/x86_64/elf.h
+++ work/xen/include/asm-x86/x86_64/elf.h 2006-10-23 11:36:17.000000000
+0900
@@ -1,14 +1,51 @@
+/*
+ * Based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS from Linux 2.6.16
+ */
+
#ifndef __X86_64_ELF_H__
#define __X86_64_ELF_H__
-#include <xen/lib.h> /* for printk() used in stub */
+#define ELF_NGREG 27
-#define ELF_NGREG 1 /* XXX: Define to be at least as large as
- however many register slots are needed when
- crash notes are written during crash dump */
+/* XXX: Xen doesn't have orig_rax, so it (slot 15) is omitted.
+ * Xen doesn't have threads, so fs and gs are read from the CPU and
+ * thus values 21 and 22 are just duplicates of 25 and 26
+ * respectively. All these values could be passed from dom0 in the
+ * case of it crashing, but does that help?
+ *
+ * Lastly, I'm not sure why ds, es, fs and gs are read from
+ * the CPU rather than regs, but linux does this
+ */
-#define ELF_CORE_COPY_REGS(pr_reg, regs) \
- printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \
+ unsigned v; \
+ (pr_reg)[0] = (regs)->r15; \
+ (pr_reg)[1] = (regs)->r14; \
+ (pr_reg)[2] = (regs)->r13; \
+ (pr_reg)[3] = (regs)->r12; \
+ (pr_reg)[4] = (regs)->rbp; \
+ (pr_reg)[5] = (regs)->rbx; \
+ (pr_reg)[6] = (regs)->r11; \
+ (pr_reg)[7] = (regs)->r10; \
+ (pr_reg)[8] = (regs)->r9; \
+ (pr_reg)[9] = (regs)->r8; \
+ (pr_reg)[10] = (regs)->rax; \
+ (pr_reg)[11] = (regs)->rcx; \
+ (pr_reg)[12] = (regs)->rdx; \
+ (pr_reg)[13] = (regs)->rsi; \
+ (pr_reg)[14] = (regs)->rdi; \
+ (pr_reg)[16] = (regs)->rip; \
+ (pr_reg)[17] = (regs)->cs; \
+ (pr_reg)[18] = (regs)->eflags; \
+ (pr_reg)[19] = (regs)->rsp; \
+ (pr_reg)[20] = (regs)->ss; \
+ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[21] = v; \
+ asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[22] = v; \
+ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
+ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
+ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
+ asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \
+} while(0);
#endif /* __X86_64_ELF_H__ */
--- 0004/xen/include/asm-x86/x86_64/kexec.h
+++ work/xen/include/asm-x86/x86_64/kexec.h 2006-10-23 11:36:17.000000000
+0900
@@ -1,20 +1,68 @@
+/******************************************************************************
+ * kexec.h
+ *
+ * Based heavily on machine_kexec.c and kexec.h from Linux 2.6.19-rc1
+ *
+ */
+
#ifndef __X86_64_KEXEC_H__
#define __X86_64_KEXEC_H__
-
-#include <xen/lib.h> /* for printk() used in stub */
+
+#include <xen/lib.h>
#include <xen/types.h>
#include <public/xen.h>
#include <xen/kexec.h>
-
+#include <asm/processor.h>
+#include <xen/string.h>
+#include <asm/fixmap.h>
+
+/*
+ * Save the registers of the cpu on which the panic occurred in
+ * crash_kexec so that a valid sp is recorded. The registers of the other
+ * cpus will be saved in machine_crash_shutdown while shooting them down.
+ */
static inline void crash_setup_regs(struct cpu_user_regs *newregs,
- struct cpu_user_regs *oldregs)
+ struct cpu_user_regs *oldregs)
{
- printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+ if (oldregs)
+ memcpy(newregs, oldregs, sizeof(*newregs));
+ else {
+ __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
+ __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
+ __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
+ __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
+ __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
+ __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
+ __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
+ __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
+ __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
+ __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
+ __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
+ __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
+ __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
+ __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
+ __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
+ __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
+ __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
+ __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
+ __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
+
+ newregs->rip = (unsigned long)current_text_addr();
+ }
}
+typedef void (*relocate_new_kernel_t)(
+ unsigned long indirection_page,
+ unsigned long page_list,
+ unsigned long start_address);
+
static inline void machine_kexec(xen_kexec_image_t *image)
{
- printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+ relocate_new_kernel_t rnk;
+ /* NOTE(review): slot 1 presumably mirrors Linux's VA_CONTROL_PAGE - confirm. */
+ rnk = (relocate_new_kernel_t) image->page_list[1];
+ (*rnk)(image->indirection_page, (unsigned long)image->page_list,
+ image->start_address);
}
#endif /* __X86_64_KEXEC_H__ */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|