kexec: x86_64 This is the first x86_64 release of kexec for xen/dom0. The code is in an early phase, but it compiles and kexec:ing into a Linux kernel seems to work well. Rebooting into a new kernel may work using kdump too, but register saving support is still missing. The x86 component is a prerequisite for this patch. Signed-Off-By: Magnus Damm Signed-Off-By: Horms buildconfigs/linux-defconfig_xen_x86_64 | 1 linux-2.6-xen-sparse/arch/x86_64/Kconfig | 2 linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile | 6 linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c | 26 linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h | 30 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h | 7 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h | 2 patches/linux-2.6.16.13/2-linux-2.6.16-kexec_page_table_a_x86_64.patch | 421 ++++++++++ patches/linux-2.6.16.13/3-linux-2.6.16-kexec_page_table_a_x86_64-xen.patch | 151 +++ xen/arch/x86/x86_64/entry.S | 2 xen/arch/x86/x86_64/machine_kexec.c | 21 xen/include/asm-x86/x86_64/elf.h | 48 + xen/include/asm-x86/x86_64/kexec.h | 33 13 files changed, 736 insertions(+), 14 deletions(-) --- x/buildconfigs/linux-defconfig_xen_x86_64 +++ x/buildconfigs/linux-defconfig_xen_x86_64 @@ -139,6 +139,7 @@ CONFIG_SWIOTLB=y CONFIG_PHYSICAL_START=0x100000 CONFIG_SECCOMP=y CONFIG_HZ_100=y +CONFIG_KEXEC=y # CONFIG_HZ_250 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=100 --- x/linux-2.6-xen-sparse/arch/x86_64/Kconfig +++ x/linux-2.6-xen-sparse/arch/x86_64/Kconfig @@ -433,7 +433,7 @@ config X86_MCE_AMD config KEXEC bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL && !X86_64_XEN + depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. 
It is like a reboot --- x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile +++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile @@ -23,7 +23,11 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o obj-$(CONFIG_X86_XEN_GENAPIC) += genapic.o genapic_xen.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o +ifdef CONFIG_XEN +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o +else obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o +endif obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj-$(CONFIG_ACPI_SLEEP) += suspend.o @@ -59,7 +63,7 @@ pci-dma-y += ../../i386/kernel/pci-dma microcode-$(subst m,y,$(CONFIG_MICROCODE)) := ../../i386/kernel/microcode-xen.o quirks-y := ../../i386/kernel/quirks-xen.o -n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o +n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o crash.o include $(srctree)/scripts/Makefile.xen --- x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c +++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c @@ -79,6 +79,10 @@ #include #include +#ifdef CONFIG_XEN +#include +#endif + extern unsigned long start_pfn; extern struct edid_info edid_info; @@ -446,6 +450,7 @@ static __init void parse_cmdline_early ( * after a kernel panic. 
*/ else if (!memcmp(from, "crashkernel=", 12)) { +#ifndef CONFIG_XEN unsigned long size, base; size = memparse(from+12, &from); if (*from == '@') { @@ -456,6 +461,10 @@ static __init void parse_cmdline_early ( crashk_res.start = base; crashk_res.end = base + size - 1; } +#else + printk("Ignoring crashkernel command line, " + "parameter will be supplied by xen\n"); +#endif } #endif @@ -801,10 +810,23 @@ void __init setup_arch(char **cmdline_p) #endif #endif /* !CONFIG_XEN */ #ifdef CONFIG_KEXEC +#ifndef CONFIG_XEN if (crashk_res.start != crashk_res.end) { reserve_bootmem(crashk_res.start, crashk_res.end - crashk_res.start + 1); } +#else + { + xen_kexec_reserve_t reservation; + BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0, + &reservation)); + if (reservation.size) { + crashk_res.start = reservation.start; + crashk_res.end = reservation.start + + reservation.size - 1; + } + } +#endif #endif paging_init(); @@ -950,6 +972,10 @@ void __init setup_arch(char **cmdline_p) iommu_hole_init(); #endif +#ifdef CONFIG_KEXEC + request_resource(&ioport_resource, &crashk_res); +#endif + #ifdef CONFIG_XEN { struct physdev_set_iopl set_iopl; --- /dev/null +++ x/linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h @@ -0,0 +1,30 @@ +/* + * include/asm-x86_64/kexec-xen.h + * + * Created By: Horms + */ + +#ifndef _X86_64_KEXEC_XEN_H +#define _X86_64_KEXEC_XEN_H + +#include +#include +#include + +static inline void crash_translate_regs(struct pt_regs *linux_regs, + struct cpu_user_regs *xen_regs) +{ +#warning Implement me! 
+} + +#endif /* _X86_64_KEXEC_XEN_H */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ --- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h +++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h @@ -369,4 +369,11 @@ HYPERVISOR_xenoprof_op( return _hypercall2(int, xenoprof_op, op, arg); } +static inline int +HYPERVISOR_kexec( + unsigned long op, unsigned int arg1, void * extra_args) +{ + return _hypercall3(int, kexec_op, op, arg1, extra_args); +} + #endif /* __HYPERCALL_H__ */ --- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h +++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h @@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct p #define profile_pc(regs) instruction_pointer(regs) #endif +#include + void signal_fault(struct pt_regs *regs, void __user *frame, char *where); struct task_struct; --- x/xen/arch/x86/x86_64/entry.S +++ x/xen/arch/x86/x86_64/entry.S @@ -566,6 +566,7 @@ ENTRY(hypercall_table) .quad do_xenoprof_op .quad do_event_channel_op .quad do_physdev_op + .quad do_kexec .rept NR_hypercalls-((.-hypercall_table)/8) .quad do_ni_hypercall .endr @@ -605,6 +606,7 @@ ENTRY(hypercall_args_table) .byte 2 /* do_xenoprof_op */ .byte 2 /* do_event_channel_op */ .byte 2 /* do_physdev_op */ + .byte 3 /* do_kexec */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr --- x/xen/arch/x86/x86_64/machine_kexec.c +++ x/xen/arch/x86/x86_64/machine_kexec.c @@ -4,18 +4,29 @@ * * Created By: Horms * - * Should be losely based on arch/x86_64/kernel/machine_kexec.c + * Loosely based on arch/x86_64/kernel/machine_kexec.c */ - -#include /* for printk() used in stub */ + #include #include +#include + +typedef void (*relocate_new_kernel_t)(unsigned long indirection_page, + unsigned long control_code_buffer, + unsigned long start_address, + unsigned long page_table_a, + unsigned long 
page_table_b); void machine_kexec(xen_kexec_image_t *image) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); -} + relocate_new_kernel_t rnk; + rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE); + (*rnk)(image->indirection_page, image->reboot_code_buffer, + image->start_address, image->page_table_a[0], + image->page_table_b); + } + /* * Local variables: * mode: C --- x/xen/include/asm-x86/x86_64/elf.h +++ x/xen/include/asm-x86/x86_64/elf.h @@ -3,17 +3,55 @@ * * Created By: Horms * - * Should pull be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS - * from Linux 2.6.16 + * Based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS from Linux 2.6.16 */ #ifndef __X86_ELF_X86_64_H__ #define __X86_ELF_X86_64_H__ -#include /* for printk() used in stub */ +#include -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); +#include + +/* XXX: Xen doesn't have orig_rax, so it is omitted. + * Xen doesn't have threads, so fs and gs are read from the CPU and + * thus values 21 and 22 are just duplicates of 25 and 26 + * respectively. All these values could be passed from dom0 in the + * case of it crashing, but does that help? 
+ * + * Lastly, I'm not sure why ds, es, fs and gs are read from + * the CPU rather than regs, but linux does this + */ + +#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ + unsigned v; \ + (pr_reg)[0] = (regs)->r15; \ + (pr_reg)[1] = (regs)->r14; \ + (pr_reg)[2] = (regs)->r13; \ + (pr_reg)[3] = (regs)->r12; \ + (pr_reg)[4] = (regs)->rbp; \ + (pr_reg)[5] = (regs)->rbx; \ + (pr_reg)[6] = (regs)->r11; \ + (pr_reg)[7] = (regs)->r10; \ + (pr_reg)[8] = (regs)->r9; \ + (pr_reg)[9] = (regs)->r8; \ + (pr_reg)[10] = (regs)->rax; \ + (pr_reg)[11] = (regs)->rcx; \ + (pr_reg)[12] = (regs)->rdx; \ + (pr_reg)[13] = (regs)->rsi; \ + (pr_reg)[14] = (regs)->rdi; \ + (pr_reg)[16] = (regs)->rip; \ + (pr_reg)[17] = (regs)->cs; \ + (pr_reg)[18] = (regs)->eflags; \ + (pr_reg)[19] = (regs)->rsp; \ + (pr_reg)[20] = (regs)->ss; \ + asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[21] = v; \ + asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[22] = v; \ + asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ + asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ + asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ + asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \ +} while(0); #endif /* __X86_ELF_X86_64_H__ */ --- x/xen/include/asm-x86/x86_64/kexec.h +++ x/xen/include/asm-x86/x86_64/kexec.h @@ -10,14 +10,43 @@ #ifndef __X86_64_KEXEC_H__ #define __X86_64_KEXEC_H__ -#include /* for printk() used in stub */ +#include #include #include +/* + * Saving the registers of the cpu on which panic occurred in + * crash_kexec to save a valid sp. The registers of other cpus + * will be saved in machine_crash_shutdown while shooting them down. 
+ */ static void crash_setup_regs(struct cpu_user_regs *newregs, struct cpu_user_regs *oldregs) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + if (oldregs) + memcpy(newregs, oldregs, sizeof(*newregs)); + else { + __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx)); + __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx)); + __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx)); + __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi)); + __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi)); + __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp)); + __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax)); + __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp)); + __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8)); + __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9)); + __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10)); + __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11)); + __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12)); + __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13)); + __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14)); + __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15)); + __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss)); + __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs)); + __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags)); + + newregs->rip = (unsigned long)current_text_addr(); + } } #endif /* __X86_64_KEXEC_H__ */ --- /dev/null +++ x/patches/linux-2.6.16.13/2-linux-2.6.16-kexec_page_table_a_x86_64.patch @@ -0,0 +1,421 @@ +kexec: Avoid overwriting the current pgd (V2, x86_64) + +This patch upgrades the x86_64-specific kexec code to avoid overwriting the +current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used +to start a secondary kernel that dumps the memory of the previous kernel. + +The code introduces a new set of page tables called "page_table_a". 
These +tables are used to provide an executable identity mapping without overwriting +the current pgd. The already existing page table is renamed to "page_table_b". + +KEXEC_CONTROL_CODE_SIZE is changed into a single page. This updated version of +the patch also moves the segment handling code into relocate_kernel.S. + +Signed-off-by: Magnus Damm +--- + + The patch has been tested with regular kexec and CONFIG_CRASH_DUMP. + Applies on top of 2.6.16 and 2.6.17-rc4. + + arch/x86_64/kernel/machine_kexec.c | 193 +++++++++++++++++----------------- + arch/x86_64/kernel/relocate_kernel.S | 84 +++++++++++++- + include/asm-x86_64/kexec.h | 15 ++ + 3 files changed, 189 insertions(+), 103 deletions(-) + +--- x/arch/x86_64/kernel/machine_kexec.c ++++ x/arch/x86_64/kernel/machine_kexec.c +@@ -2,6 +2,10 @@ + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman + * ++ * 2006-05-19 Magnus Damm : ++ * - rewrote identity map code to avoid overwriting current pgd ++ * - moved segment handling code into relocate_kernel.S ++ * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ +@@ -96,81 +100,110 @@ out: + } + + +-static int init_pgtable(struct kimage *image, unsigned long start_pgtable) ++static int create_page_table_b(struct kimage *image) + { +- pgd_t *level4p; +- level4p = (pgd_t *)__va(start_pgtable); +- return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); +-} ++ struct kimage_arch *arch = &image->arch_data; + +-static void set_idt(void *newidt, u16 limit) +-{ +- struct desc_ptr curidt; ++ arch->page_table_b = kimage_alloc_control_pages(image, 0); + +- /* x86-64 supports unaliged loads & stores */ +- curidt.size = limit; +- curidt.address = (unsigned long)newidt; ++ if (!arch->page_table_b) ++ return -ENOMEM; + +- __asm__ __volatile__ ( +- "lidtq %0\n" +- : : "m" (curidt) +- ); +-}; ++ return init_level4_page(image, page_address(arch->page_table_b), ++ 0, end_pfn << PAGE_SHIFT); ++} + ++typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, ++ unsigned long control_code_buffer, ++ unsigned long start_address, ++ unsigned long page_table_a, ++ unsigned long page_table_b) ATTRIB_NORET; ++ ++const extern unsigned char relocate_new_kernel[]; ++const extern unsigned long relocate_new_kernel_size; + +-static void set_gdt(void *newgdt, u16 limit) ++static int allocate_page_table_a(struct kimage *image) + { +- struct desc_ptr curgdt; ++ struct kimage_arch *arch = &image->arch_data; ++ struct page *page; ++ int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]); + +- /* x86-64 supports unaligned loads & stores */ +- curgdt.size = limit; +- curgdt.address = (unsigned long)newgdt; ++ for (; k > 0; k--) { ++ page = kimage_alloc_control_pages(image, 0); ++ if (!page) ++ return -ENOMEM; + +- __asm__ __volatile__ ( +- "lgdtq %0\n" +- : : "m" (curgdt) +- ); +-}; ++ clear_page(page_address(page)); ++ arch->page_table_a[k - 1] = page; ++ } + +-static void load_segments(void) +-{ +- __asm__ __volatile__ ( +- "\tmovl %0,%%ds\n" +- "\tmovl %0,%%es\n" +- "\tmovl %0,%%ss\n" +- "\tmovl %0,%%fs\n" +- 
"\tmovl %0,%%gs\n" +- : : "a" (__KERNEL_DS) : "memory" +- ); ++ return 0; + } + +-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, +- unsigned long control_code_buffer, +- unsigned long start_address, +- unsigned long pgtable) ATTRIB_NORET; ++#define _PAGE_KERNEL_EXEC __PAGE_KERNEL_EXEC ++#define pa_page(page) __pa_symbol(page_address(page)) /* __pa() miscompiles */ + +-const extern unsigned char relocate_new_kernel[]; +-const extern unsigned long relocate_new_kernel_size; ++static int create_mapping(struct page *root, struct page **pages, ++ unsigned long va, unsigned long pa) ++{ ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *pte; ++ int k = 0; ++ ++ pgd = (pgd_t *)page_address(root) + pgd_index(va); ++ if (!pgd_present(*pgd)) ++ set_pgd(pgd, __pgd(pa_page(pages[k++]) | _KERNPG_TABLE)); ++ ++ pud = pud_offset(pgd, va); ++ if (!pud_present(*pud)) ++ set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE)); ++ ++ pmd = pmd_offset(pud, va); ++ if (!pmd_present(*pmd)) ++ set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE)); ++ ++ pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va); ++ set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC)); ++ ++ return k; ++} + + int machine_kexec_prepare(struct kimage *image) + { +- unsigned long start_pgtable, control_code_buffer; +- int result; ++ void *control_page; ++ unsigned long pa; ++ int k; + +- /* Calculate the offsets */ +- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; +- control_code_buffer = start_pgtable + PAGE_SIZE; +- +- /* Setup the identity mapped 64bit page table */ +- result = init_pgtable(image, start_pgtable); +- if (result) +- return result; +- +- /* Place the code in the reboot code buffer */ +- memcpy(__va(control_code_buffer), relocate_new_kernel, +- relocate_new_kernel_size); ++ memset(&image->arch_data, 0, sizeof(image->arch_data)); + +- return 0; ++ k = allocate_page_table_a(image); ++ if (k) ++ return k; ++ ++ /* fill in control_page with 
assembly code */ ++ ++ control_page = page_address(image->control_code_page); ++ memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size); ++ ++ /* map the control_page at the virtual address of relocate_kernel.S */ ++ ++ pa = __pa(control_page); ++ ++ k = create_mapping(image->arch_data.page_table_a[0], ++ &image->arch_data.page_table_a[1], ++ (unsigned long)relocate_new_kernel, pa); ++ ++ /* identity map the control_page */ ++ ++ create_mapping(image->arch_data.page_table_a[0], ++ &image->arch_data.page_table_a[k + 1], ++ pa, pa); ++ ++ /* create identity mapped page table aka page_table_b */ ++ ++ return create_page_table_b(image); + } + + void machine_kexec_cleanup(struct kimage *image) +@@ -185,47 +218,17 @@ void machine_kexec_cleanup(struct kimage + NORET_TYPE void machine_kexec(struct kimage *image) + { + unsigned long page_list; +- unsigned long control_code_buffer; +- unsigned long start_pgtable; ++ unsigned long control_code; ++ unsigned long page_table_a; ++ unsigned long page_table_b; + relocate_new_kernel_t rnk; + +- /* Interrupts aren't acceptable while we reboot */ +- local_irq_disable(); +- +- /* Calculate the offsets */ + page_list = image->head; +- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; +- control_code_buffer = start_pgtable + PAGE_SIZE; ++ control_code = __pa(page_address(image->control_code_page)); ++ page_table_a = __pa(page_address(image->arch_data.page_table_a[0])); ++ page_table_b = __pa(page_address(image->arch_data.page_table_b)); + +- /* Set the low half of the page table to my identity mapped +- * page table for kexec. Leave the high half pointing at the +- * kernel pages. Don't bother to flush the global pages +- * as that will happen when I fully switch to my identity mapped +- * page table anyway. 
+- */ +- memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); +- __flush_tlb(); +- +- +- /* The segment registers are funny things, they are +- * automatically loaded from a table, in memory wherever you +- * set them to a specific selector, but this table is never +- * accessed again unless you set the segment to a different selector. +- * +- * The more common model are caches where the behide +- * the scenes work is done, but is also dropped at arbitrary +- * times. +- * +- * I take advantage of this here by force loading the +- * segments, before I zap the gdt with an invalid value. +- */ +- load_segments(); +- /* The gdt & idt are now invalid. +- * If you want to load them you must set up your own idt & gdt. +- */ +- set_gdt(phys_to_virt(0),0); +- set_idt(phys_to_virt(0),0); + /* now call it */ +- rnk = (relocate_new_kernel_t) control_code_buffer; +- (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); ++ rnk = (relocate_new_kernel_t) relocate_new_kernel; ++ (*rnk)(page_list, control_code, image->start, page_table_a, page_table_b); + } +--- x/arch/x86_64/kernel/relocate_kernel.S ++++ x/arch/x86_64/kernel/relocate_kernel.S +@@ -2,11 +2,18 @@ + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2005 Eric Biederman + * ++ * 2006-05-19 Magnus Damm : ++ * - moved segment handling code from machine_kexec.c ++ * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ + + #include ++#include ++ ++.text ++.align (1 << PAGE_SHIFT) + + /* + * Must be relocatable PIC code callable as a C function, that once +@@ -18,21 +25,69 @@ relocate_new_kernel: + /* %rdi page_list + * %rsi reboot_code_buffer + * %rdx start address +- * %rcx page_table +- * %r8 arg5 ++ * %rcx page_table_a ++ * %r8 page_table_b + * %r9 arg6 + */ +- ++ + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + ++ /* switch to page_table_a */ ++ movq %rcx, %cr3 ++ ++ /* setup idt */ ++ ++ movq %rsi, %rax ++ addq $(idt_48 - relocate_new_kernel), %rax ++ lidtq (%rax) ++ ++ /* setup gdt */ ++ ++ movq %rsi, %rax ++ addq $(gdt - relocate_new_kernel), %rax ++ movq %rsi, %r9 ++ addq $((gdt_48 - relocate_new_kernel) + 2), %r9 ++ movq %rax, (%r9) ++ ++ movq %rsi, %rax ++ addq $(gdt_48 - relocate_new_kernel), %rax ++ lgdtq (%rax) ++ ++ /* setup data segment registers */ ++ ++ xorl %eax,%eax ++ movl %eax, %ds ++ movl %eax, %es ++ movl %eax, %fs ++ movl %eax, %gs ++ movl %eax, %ss ++ + /* set a new stack at the bottom of our page... */ + lea 4096(%rsi), %rsp + ++ /* load new code segment */ ++ ++ movq %rsp, %rcx ++ xorq %rax, %rax ++ pushq %rax /* SS */ ++ pushq %rcx /* ESP */ ++ pushq %rax /* RFLAGS */ ++ ++ movq $(gdt_code - gdt), %rax ++ pushq %rax /* CS */ ++ ++ movq %rsi, %rax ++ addq $(identity_mapped - relocate_new_kernel), %rax ++ pushq %rax /* RIP */ ++ ++ iretq ++ ++identity_mapped: + /* store the parameters back on the stack */ + pushq %rdx /* store the start address */ +- ++ + /* Set cr0 to a known state: + * 31 1 == Paging enabled + * 18 0 == Alignment check disabled +@@ -69,7 +124,7 @@ relocate_new_kernel: + /* Switch to the identity mapped page tables, + * and flush the TLB. 
+ */ +- movq %rcx, %cr3 ++ movq %r8, %cr3 + + /* Do the copies */ + movq %rdi, %rcx /* Put the page_list in %rcx */ +@@ -136,6 +191,25 @@ relocate_new_kernel: + xorq %r15, %r15 + + ret ++ .align 16 ++gdt: ++ .long 0x00000000 /* NULL descriptor */ ++ .long 0x00000000 ++gdt_code: ++ .long 0x00000000 /* code descriptor */ ++ .long 0x00209800 ++ ++gdt_end: ++ .align 4 ++ ++idt_48: ++ .word 0 # idt limit = 0 ++ .quad 0, 0 # idt base = 0L ++ ++gdt_48: ++ .word gdt_end - gdt - 1 # gdt limit ++ .quad 0, 0 # gdt base (filled in later) ++ + relocate_new_kernel_end: + + .globl relocate_new_kernel_size +--- x/include/asm-x86_64/kexec.h ++++ x/include/asm-x86_64/kexec.h +@@ -21,15 +21,24 @@ + /* Maximum address we can use for the control pages */ + #define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) + +-/* Allocate one page for the pdp and the second for the code */ +-#define KEXEC_CONTROL_CODE_SIZE (4096UL + 4096UL) ++#define KEXEC_CONTROL_CODE_SIZE 4096 + + /* The native architecture */ + #define KEXEC_ARCH KEXEC_ARCH_X86_64 + + #define MAX_NOTE_BYTES 1024 + +-struct kimage_arch {}; ++struct kimage_arch { ++ /* page_table_a[] holds enough pages to create a new page table ++ * that maps the control page twice.. ++ * ++ * page_table_b points to the root page of a page table which is used ++ * to provide identity mapping of all ram. 
++ */ ++ ++ struct page *page_table_a[7]; /* 2 * (pte + pud + pmd) + pgd */ ++ struct page *page_table_b; ++}; + + /* + * Saving the registers of the cpu on which panic occured in --- /dev/null +++ x/patches/linux-2.6.16.13/3-linux-2.6.16-kexec_page_table_a_x86_64-xen.patch @@ -0,0 +1,151 @@ + arch/x86_64/kernel/machine_kexec.c | 84 +++++++++++++++++++++++++++++++++--- + 1 file changed, 77 insertions(+), 7 deletions(-) + +--- x/arch/x86_64/kernel/machine_kexec.c ++++ x/arch/x86_64/kernel/machine_kexec.c +@@ -19,6 +19,50 @@ + #include + #include + ++#ifdef CONFIG_XEN ++#include ++#include ++ ++#define x__pmd(x) ((pmd_t) { (x) } ) ++#define x__pud(x) ((pud_t) { (x) } ) ++#define x__pgd(x) ((pgd_t) { (x) } ) ++ ++#define x_pmd_val(x) ((x).pmd) ++#define x_pud_val(x) ((x).pud) ++#define x_pgd_val(x) ((x).pgd) ++ ++static inline void x_set_pmd(pmd_t *dst, pmd_t val) ++{ ++ x_pmd_val(*dst) = x_pmd_val(val); ++} ++ ++static inline void x_set_pud(pud_t *dst, pud_t val) ++{ ++ x_pud_val(*dst) = phys_to_machine(x_pud_val(val)); ++} ++ ++static inline void x_pud_clear (pud_t *pud) ++{ ++ x_pud_val(*pud) = 0; ++} ++ ++static inline void x_set_pgd(pgd_t *dst, pgd_t val) ++{ ++ x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); ++} ++ ++static inline void x_pgd_clear (pgd_t * pgd) ++{ ++ x_pgd_val(*pgd) = 0; ++} ++ ++#define MY_LARGE_EXEC (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE) ++#define MY_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) ++#else ++#define MY_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC ++#define MY_TABLE _KERNPG_TABLE ++#endif /* CONFIG_XEN */ ++ + static void init_level2_page(pmd_t *level2p, unsigned long addr) + { + unsigned long end_addr; +@@ -26,7 +70,7 @@ static void init_level2_page(pmd_t *leve + addr &= PAGE_MASK; + end_addr = addr + PUD_SIZE; + while (addr < end_addr) { +- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); ++ x_set_pmd(level2p++, x__pmd(addr | MY_LARGE_EXEC)); + addr += PMD_SIZE; + } + } +@@ 
-51,12 +95,12 @@ static int init_level3_page(struct kimag + } + level2p = (pmd_t *)page_address(page); + init_level2_page(level2p, addr); +- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); ++ x_set_pud(level3p++, x__pud(__pa(level2p) | MY_TABLE)); + addr += PUD_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { +- pud_clear(level3p++); ++ x_pud_clear(level3p++); + addr += PUD_SIZE; + } + out: +@@ -87,12 +131,12 @@ static int init_level4_page(struct kimag + if (result) { + goto out; + } +- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); ++ x_set_pgd(level4p++, x__pgd(__pa(level3p) | MY_TABLE)); + addr += PGDIR_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { +- pgd_clear(level4p++); ++ x_pgd_clear(level4p++); + addr += PGDIR_SIZE; + } + out: +@@ -103,14 +147,21 @@ out: + static int create_page_table_b(struct kimage *image) + { + struct kimage_arch *arch = &image->arch_data; ++ unsigned long last_page; + + arch->page_table_b = kimage_alloc_control_pages(image, 0); + + if (!arch->page_table_b) + return -ENOMEM; + ++#ifdef CONFIG_XEN ++ last_page = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); ++#else ++ last_page = end_pfn; ++#endif ++ + return init_level4_page(image, page_address(arch->page_table_b), +- 0, end_pfn << PAGE_SHIFT); ++ 0, last_page << PAGE_SHIFT); + } + + typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, +@@ -211,6 +262,7 @@ void machine_kexec_cleanup(struct kimage + return; + } + ++#ifndef CONFIG_XEN + /* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. 
+@@ -230,5 +282,23 @@ NORET_TYPE void machine_kexec(struct kim + + /* now call it */ + rnk = (relocate_new_kernel_t) relocate_new_kernel; +- (*rnk)(page_list, control_code, image->start, page_table_a, page_table_b); ++ (*rnk)(page_list, control_code, image->start, page_table_a, ++ page_table_b); ++} ++#endif /* !CONFIG_XEN */ ++ ++#ifdef CONFIG_XEN ++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,struct kimage *image) ++{ ++ struct kimage_arch *arch = &image->arch_data; ++ int k, n = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]); ++ ++ for (k = 0; k < n; k++) ++ xki->page_table_a[k] = ++ pfn_to_mfn(page_to_pfn(arch->page_table_a[k])) ++ << PAGE_SHIFT; ++ ++ xki->page_table_b = ++ pfn_to_mfn(page_to_pfn(arch->page_table_b)) << PAGE_SHIFT; + } ++#endif /* CONFIG_XEN */