kexec: x86_32 This is the x86_32 component of kexec for xen. The x86 component is a prerequisite for this patch. Signed-Off-By: Horms Signed-Off-By: Magnus Damm buildconfigs/linux-defconfig_xen_x86_32 | 2 linux-2.6-xen-sparse/arch/i386/Kconfig | 2 linux-2.6-xen-sparse/arch/i386/kernel/Makefile | 2 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c | 29 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h | 57 + linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h | 8 patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_i386-xen.patch | 59 + patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_i386.patch | 457 ++++++++++ patches/linux-2.6.16.13/series | 2 xen/arch/x86/crash.c | 47 + xen/arch/x86/x86_32/entry.S | 2 xen/arch/x86/x86_32/machine_kexec.c | 27 xen/include/asm-x86/x86_32/elf.h | 32 xen/include/asm-x86/x86_32/kexec.h | 65 + 14 files changed, 758 insertions(+), 33 deletions(-) --- x/buildconfigs/linux-defconfig_xen_x86_32 +++ x/buildconfigs/linux-defconfig_xen_x86_32 @@ -184,6 +184,7 @@ CONFIG_MTRR=y CONFIG_REGPARM=y CONFIG_SECCOMP=y CONFIG_HZ_100=y +CONFIG_KEXEC=y # CONFIG_HZ_250 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=100 @@ -2775,6 +2776,7 @@ CONFIG_NTFS_FS=m # CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y +# CONFIG_PROC_VMCORE is not set CONFIG_SYSFS=y CONFIG_TMPFS=y # CONFIG_HUGETLB_PAGE is not set --- x/linux-2.6-xen-sparse/arch/i386/Kconfig +++ x/linux-2.6-xen-sparse/arch/i386/Kconfig @@ -726,7 +726,7 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL && !X86_XEN + depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. 
It is like a reboot --- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile +++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile @@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen obj-y += fixup.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o -n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o +n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) obj-y := $(call cherrypickxen, $(obj-y)) --- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c +++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c @@ -68,6 +68,10 @@ #include "setup_arch_pre.h" #include +#ifdef CONFIG_XEN +#include +#endif + /* Forward Declaration. */ void __init find_max_pfn(void); @@ -939,6 +943,7 @@ static void __init parse_cmdline_early ( * after a kernel panic. */ else if (!memcmp(from, "crashkernel=", 12)) { +#ifndef CONFIG_XEN unsigned long size, base; size = memparse(from+12, &from); if (*from == '@') { @@ -949,6 +954,10 @@ static void __init parse_cmdline_early ( crashk_res.start = base; crashk_res.end = base + size - 1; } +#else + printk("Ignoring crashkernel command line, " + "parameter will be supplied by xen\n"); +#endif } #endif #ifdef CONFIG_PROC_VMCORE @@ -1318,9 +1327,22 @@ void __init setup_bootmem_allocator(void } #endif #ifdef CONFIG_KEXEC +#ifndef CONFIG_XEN if (crashk_res.start != crashk_res.end) reserve_bootmem(crashk_res.start, crashk_res.end - crashk_res.start + 1); +#else + { + xen_kexec_reserve_t reservation; + BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0, + &reservation)); + if (reservation.size) { + crashk_res.start = reservation.start; + crashk_res.end = reservation.start + + reservation.size - 1; + } + } +#endif #endif if (!xen_feature(XENFEAT_auto_translated_physmap)) @@ -1374,6 +1396,10 @@ legacy_init_iomem_resources(struct e820e res->end = res->start + e820[i].size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); +#ifdef 
CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif + + if (e820[i].type == E820_RAM) { /* * We don't know which RAM region contains kernel data, @@ -1382,9 +1408,6 @@ legacy_init_iomem_resources(struct e820e */ request_resource(res, code_resource); request_resource(res, data_resource); -#ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); -#endif } } } --- /dev/null +++ x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h @@ -0,0 +1,57 @@ +/* + * include/asm-i386/kexec-xen.h + * + * Created By: Horms + */ + +#ifndef _I386_KEXEC_XEN_H +#define _I386_KEXEC_XEN_H + +#include +#include +#include + +static inline void crash_translate_regs(struct pt_regs *linux_regs, + struct cpu_user_regs *xen_regs) +{ + xen_regs->ebx = linux_regs->ebx; + xen_regs->ecx = linux_regs->ecx; + xen_regs->edx = linux_regs->edx; + xen_regs->esi = linux_regs->esi; + xen_regs->edi = linux_regs->edi; + xen_regs->ebp = linux_regs->ebp; + xen_regs->eax = linux_regs->eax; + xen_regs->esp = linux_regs->esp; + xen_regs->ss = linux_regs->xss; + xen_regs->cs = linux_regs->xcs; + xen_regs->ds = linux_regs->xds; + xen_regs->es = linux_regs->xes; + xen_regs->eflags = linux_regs->eflags; +} + +/* Kexec needs to know about the actual physical address. + * But in xen, on some architectures, a physical address is a + * pseudo-physical address. 
*/ +#ifdef CONFIG_XEN +#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page)) +#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn)) +#define kexec_virt_to_phys(addr) virt_to_machine(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr)) +#else +#define kexec_page_to_pfn(page) page_to_pfn(page) +#define kexec_pfn_to_page(pfn) pfn_to_page(pfn) +#define kexec_virt_to_phys(addr) virt_to_phys(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(addr) +#endif + +#endif /* _I386_KEXEC_XEN_H */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ --- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h +++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h @@ -385,5 +385,13 @@ HYPERVISOR_xenoprof_op( return _hypercall2(int, xenoprof_op, op, arg); } +static inline int +HYPERVISOR_kexec( + unsigned long op, unsigned int arg1, void * extra_args) +{ + return _hypercall3(int, kexec_op, op, arg1, extra_args); +} + + #endif /* __HYPERCALL_H__ */ --- x/xen/arch/x86/crash.c +++ x/xen/arch/x86/crash.c @@ -21,6 +21,7 @@ #include #include #include +#include #include static int crashing_cpu; @@ -169,6 +170,51 @@ static void nmi_shootdown_cpus(void) } #endif +/* The cr3 for dom0 on each of its vcpus + * It is added as ELF_Prstatus prstatus.pr_reg[ELF_NGREG-1)], where + * prstatus is the data of the elf note, and ELF_NGREG was extended + * by one to allow extra space. 
+ * This code runs after all cpus except the crashing one have + * been shutdown so as to avoid having to hold domlist_lock, + * as locking after a crash is playing with fire */ +void find_dom0_cr3(void) +{ + struct domain *d; + struct vcpu *v; + uint32_t *buf; + uint32_t cr3; + Elf_Note note; + + /* Don't need to grab domlist_lock as we are the only thing running */ + + /* No need to traverse domain_list, as dom0 is always first */ + d = domain_list; + BUG_ON(d->domain_id); + + for_each_vcpu ( d, v ) { + if ( test_bit(_VCPUF_down, &v->vcpu_flags) ) + continue; + buf = (uint32_t *)per_cpu(crash_notes, v->processor); + if (!buf) /* XXX: Can this ever occur? */ + continue; + + memcpy(¬e, buf, sizeof(Elf_Note)); + buf += (sizeof(Elf_Note) +3)/4 + (note.namesz + 3)/4 + + (note.descsz + 3)/4; + + /* XXX: This probably doesn't take into account shadow mode, + * but that might not be a problem */ + cr3 = pagetable_get_pfn(v->arch.guest_table); + + buf = append_elf_note(buf, "Xen Domanin-0 CR3", + NT_XEN_DOM0_CR3, &cr3, 4); + final_note(buf); + + printk("domain:%i vcpu:%u processor:%u cr3:%08x\n", + d->domain_id, v->vcpu_id, v->processor, cr3); + } +} + void machine_crash_shutdown(struct cpu_user_regs *regs) { printk("machine_crash_shutdown: %d\n", smp_processor_id()); @@ -180,6 +226,7 @@ void machine_crash_shutdown(struct cpu_u disable_IO_APIC(); #endif crash_save_self(regs); + find_dom0_cr3(); } /* --- x/xen/arch/x86/x86_32/entry.S +++ x/xen/arch/x86/x86_32/entry.S @@ -660,6 +660,7 @@ ENTRY(hypercall_table) .long do_hvm_op .long do_sysctl /* 35 */ .long do_domctl + .long do_kexec_op .rept NR_hypercalls-((.-hypercall_table)/4) .long do_ni_hypercall .endr @@ -702,6 +703,7 @@ ENTRY(hypercall_args_table) .byte 2 /* do_hvm_op */ .byte 1 /* do_sysctl */ /* 35 */ .byte 1 /* do_domctl */ + .byte 1 /* do_kexec_op */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr --- x/xen/arch/x86/x86_32/machine_kexec.c +++ x/xen/arch/x86/x86_32/machine_kexec.c 
@@ -1,18 +1,31 @@ -/* +/****************************************************************************** * arch/x86/x86_32/machine_kexec.c - * Handle transition of Linux booting another kernel - * - * Created By: Horms + * + * Created By: Horms * - * Should be losely based on arch/i386/kernel/machine_kexec.c + * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16 */ -#include /* for printk() used in stub */ +#include #include +#include +#include + +typedef asmlinkage void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long reboot_code_buffer, + unsigned long start_address, + unsigned long page_table_a, + unsigned long has_pae); void machine_kexec(xen_kexec_image_t *image) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + relocate_new_kernel_t rnk; + + rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE); + (*rnk)(image->indirection_page, image->reboot_code_buffer, + image->start_address, image->page_table_a[0], + (unsigned long)cpu_has_pae); } /* --- x/xen/include/asm-x86/x86_32/elf.h +++ x/xen/include/asm-x86/x86_32/elf.h @@ -3,17 +3,39 @@ * * Created By: Horms * - * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS - * from Linux 2.6.16 + * Based heavily on include/asm-i386/elf.h and + * include/asm-i386/system.h from Linux 2.6.16 */ #ifndef __X86_ELF_X86_32_H__ #define __X86_ELF_X86_32_H__ -#include /* for printk() used in stub */ +/* XXX: Xen doesn't have orig_eax. For kdump, on a dom0 crash, the values + * for the crashing CPU could be passed down from dom0, but is that + * necessary? 
+ * Also, I'm not sure why fs and gs are derived from the CPU + * rather than regs */ -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); +#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ + unsigned i; \ + pr_reg[0] = regs->ebx; \ + pr_reg[1] = regs->ecx; \ + pr_reg[2] = regs->edx; \ + pr_reg[3] = regs->esi; \ + pr_reg[4] = regs->edi; \ + pr_reg[5] = regs->ebp; \ + pr_reg[6] = regs->eax; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9] = i; \ + asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i; \ + pr_reg[11] = 0; /* regs->orig_eax; */ \ + pr_reg[12] = regs->eip; \ + pr_reg[13] = regs->cs; \ + pr_reg[14] = regs->eflags; \ + pr_reg[15] = regs->esp; \ + pr_reg[16] = regs->ss; \ +} while(0); #endif /* __X86_ELF_X86_32_H__ */ --- x/xen/include/asm-x86/x86_32/kexec.h +++ x/xen/include/asm-x86/x86_32/kexec.h @@ -3,39 +3,72 @@ * * Created By: Horms * - * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16 - * + * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16 */ -#ifndef __X86_32_KEXEC_H__ -#define __X86_32_KEXEC_H__ - -#include /* for printk() used in stub */ -#include -#include +#ifndef __X86_KEXEC_X86_32_H__ +#define __X86_KEXEC_X86_32_H__ +/* CPU does not save ss and esp on stack if execution is already + * running in kernel mode at the time of NMI occurrence. This code + * fixes it. 
+ */ static void crash_fixup_ss_esp(struct cpu_user_regs *newregs, - struct cpu_user_regs *oldregs) + struct cpu_user_regs *oldregs) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); - return; - crash_fixup_ss_esp(newregs, oldregs); + memcpy(newregs, oldregs, sizeof(*newregs)); + newregs->esp = (unsigned long)&(oldregs->esp); + __asm__ __volatile__( + "xorl %%eax, %%eax\n\t" + "movw %%ss, %%ax\n\t" + :"=a"(newregs->ss)); } +/* + * This function is responsible for capturing register states if coming + * via panic otherwise just fix up the ss and esp if coming via kernel + * mode exception. + */ static void crash_setup_regs(struct cpu_user_regs *newregs, struct cpu_user_regs *oldregs) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + if (oldregs) + crash_fixup_ss_esp(newregs, oldregs); + else { + __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx)); + __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx)); + __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx)); + __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi)); + __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi)); + __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp)); + __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax)); + __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp)); + __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss)); + __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs)); + __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds)); + __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es)); + __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags)); + + newregs->eip = (unsigned long)current_text_addr(); + } } +/* + * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h + * + * user_mode_vm(regs) determines whether a register set came from user mode. + * This is true if V8086 mode was enabled OR if the register set was from + * protected mode with RPL-3 CS value. 
This tricky test checks that with + one comparison. Many places in the kernel can bypass this full check + if they have already ruled out V8086 mode, so user_mode(regs) can be used. + */ static inline int user_mode(struct cpu_user_regs *regs) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); - return -1; + return (regs->cs & 2) != 0; } -#endif /* __X86_32_KEXEC_H__ */ +#endif /* __X86_KEXEC_X86_32_H__ */ /* * Local variables: --- x/patches/linux-2.6.16.13/series +++ x/patches/linux-2.6.16.13/series @@ -1,5 +1,7 @@ kexec-generic.patch linux-2.6.16-kexec_page_table_a_stubs.patch +linux-2.6.16-kexec_page_table_a_i386.patch +linux-2.6.16-kexec_page_table_a_i386-xen.patch blktap-aio-16_03_06.patch device_bind.patch fix-hz-suspend.patch --- /dev/null +++ x/patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_i386.patch @@ -0,0 +1,457 @@ +kexec: Avoid overwriting the current pgd (V2, i386) + +This patch upgrades the i386-specific kexec code to avoid overwriting the +current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used +to start a secondary kernel that dumps the memory of the previous kernel. + +The code introduces a new set of page tables called "page_table_a". These +tables are used to provide an executable identity mapping without overwriting +the current pgd. This updated version of the patch fixes a PAE bug and moves +the segment handling code into the relocate_kernel.S. + +Signed-off-by: Magnus Damm +--- + + The patch has been tested with regular kexec and CONFIG_CRASH_DUMP. + Both PAE and non-PAE configurations work well. + Applies on top of 2.6.16 and 2.6.17-rc4. 
+ + arch/i386/kernel/machine_kexec.c | 230 ++++++++++++++---------------------- + arch/i386/kernel/relocate_kernel.S | 92 ++++++++++++++ + include/asm-i386/kexec.h | 12 + + 3 files changed, 192 insertions(+), 142 deletions(-) + +--- x/arch/i386/kernel/machine_kexec.c ++++ x/arch/i386/kernel/machine_kexec.c +@@ -2,6 +2,10 @@ + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman + * ++ * 2006-05-19 Magnus Damm : ++ * - rewrote identity map code to avoid overwriting current pgd ++ * - moved segment handling code into relocate_kernel.S ++ * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ +@@ -19,123 +23,73 @@ + #include + #include + +-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +- +-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +-#define L2_ATTR (_PAGE_PRESENT) +- +-#define LEVEL0_SIZE (1UL << 12UL) ++typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( ++ unsigned long indirection_page, ++ unsigned long reboot_code_buffer, ++ unsigned long start_address, ++ unsigned long page_table_a, ++ unsigned long has_pae) ATTRIB_NORET; + +-#ifndef CONFIG_X86_PAE +-#define LEVEL1_SIZE (1UL << 22UL) +-static u32 pgtable_level1[1024] PAGE_ALIGNED; ++const extern unsigned char relocate_new_kernel[]; ++extern void relocate_new_kernel_end(void); ++const extern unsigned int relocate_new_kernel_size; + +-static void identity_map_page(unsigned long address) ++static int allocate_page_table_a(struct kimage *image) + { +- unsigned long level1_index, level2_index; +- u32 *pgtable_level2; +- +- /* Find the current page table */ +- pgtable_level2 = __va(read_cr3()); ++ struct kimage_arch *arch = &image->arch_data; ++ struct page *page; ++ int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]); ++ ++ for (; k > 0; 
k--) { ++ page = kimage_alloc_control_pages(image, 0); ++ if (!page) ++ return -ENOMEM; ++ ++ clear_page(page_address(page)); ++ arch->page_table_a[k - 1] = page; ++ } + +- /* Find the indexes of the physical address to identity map */ +- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; +- level2_index = address / LEVEL1_SIZE; +- +- /* Identity map the page table entry */ +- pgtable_level1[level1_index] = address | L0_ATTR; +- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; +- +- /* Flush the tlb so the new mapping takes effect. +- * Global tlb entries are not flushed but that is not an issue. +- */ +- load_cr3(pgtable_level2); ++ return 0; + } + +-#else +-#define LEVEL1_SIZE (1UL << 21UL) +-#define LEVEL2_SIZE (1UL << 30UL) +-static u64 pgtable_level1[512] PAGE_ALIGNED; +-static u64 pgtable_level2[512] PAGE_ALIGNED; +- +-static void identity_map_page(unsigned long address) +-{ +- unsigned long level1_index, level2_index, level3_index; +- u64 *pgtable_level3; ++/* workaround for include/asm-i386/pgtable-3level.h */ + +- /* Find the current page table */ +- pgtable_level3 = __va(read_cr3()); +- +- /* Find the indexes of the physical address to identity map */ +- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; +- level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; +- level3_index = address / LEVEL2_SIZE; +- +- /* Identity map the page table entry */ +- pgtable_level1[level1_index] = address | L0_ATTR; +- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; +- set_64bit(&pgtable_level3[level3_index], +- __pa(pgtable_level2) | L2_ATTR); +- +- /* Flush the tlb so the new mapping takes effect. +- * Global tlb entries are not flushed but that is not an issue. 
+- */ +- load_cr3(pgtable_level3); +-} ++#ifdef CONFIG_X86_PAE ++#undef pgd_present ++#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT) ++#define _PGD_ATTR _PAGE_PRESENT ++#else ++#define _PGD_ATTR _KERNPG_TABLE + #endif + +-static void set_idt(void *newidt, __u16 limit) +-{ +- struct Xgt_desc_struct curidt; +- +- /* ia32 supports unaliged loads & stores */ +- curidt.size = limit; +- curidt.address = (unsigned long)newidt; +- +- load_idt(&curidt); +-}; ++#define pa_page(page) __pa(page_address(page)) + +- +-static void set_gdt(void *newgdt, __u16 limit) ++static int create_mapping(struct page *root, struct page **pages, ++ unsigned long va, unsigned long pa) + { +- struct Xgt_desc_struct curgdt; +- +- /* ia32 supports unaligned loads & stores */ +- curgdt.size = limit; +- curgdt.address = (unsigned long)newgdt; ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *pte; ++ int k = 0; + +- load_gdt(&curgdt); +-}; ++ pgd = (pgd_t *)page_address(root) + pgd_index(va); ++ if (!pgd_present(*pgd)) ++ set_pgd(pgd, __pgd(pa_page(pages[k++]) | _PGD_ATTR)); + +-static void load_segments(void) +-{ +-#define __STR(X) #X +-#define STR(X) __STR(X) ++ pud = pud_offset(pgd, va); ++ if (!pud_present(*pud)) ++ set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE)); + +- __asm__ __volatile__ ( +- "\tljmp $"STR(__KERNEL_CS)",$1f\n" +- "\t1:\n" +- "\tmovl $"STR(__KERNEL_DS)",%%eax\n" +- "\tmovl %%eax,%%ds\n" +- "\tmovl %%eax,%%es\n" +- "\tmovl %%eax,%%fs\n" +- "\tmovl %%eax,%%gs\n" +- "\tmovl %%eax,%%ss\n" +- ::: "eax", "memory"); +-#undef STR +-#undef __STR +-} ++ pmd = pmd_offset(pud, va); ++ if (!pmd_present(*pmd)) ++ set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE)); + +-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( +- unsigned long indirection_page, +- unsigned long reboot_code_buffer, +- unsigned long start_address, +- unsigned int has_pae) ATTRIB_NORET; ++ pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va); ++ set_pte(pte, __pte(pa | 
_PAGE_KERNEL_EXEC)); + +-const extern unsigned char relocate_new_kernel[]; +-extern void relocate_new_kernel_end(void); +-const extern unsigned int relocate_new_kernel_size; ++ return k; ++} + + /* + * A architecture hook called to validate the +@@ -147,11 +101,38 @@ const extern unsigned int relocate_new_k + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. +- * +- * Currently nothing. + */ + int machine_kexec_prepare(struct kimage *image) + { ++ void *control_page; ++ unsigned long pa; ++ int k; ++ ++ memset(&image->arch_data, 0, sizeof(image->arch_data)); ++ ++ k = allocate_page_table_a(image); ++ if (k) ++ return k; ++ ++ /* fill in control_page with assembly code */ ++ ++ control_page = page_address(image->control_code_page); ++ memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size); ++ ++ /* map the control_page at the virtual address of relocate_kernel.S */ ++ ++ pa = __pa(control_page); ++ ++ k = create_mapping(image->arch_data.page_table_a[0], ++ &image->arch_data.page_table_a[1], ++ (unsigned long)relocate_new_kernel, pa); ++ ++ /* identity map the control_page */ ++ ++ create_mapping(image->arch_data.page_table_a[0], ++ &image->arch_data.page_table_a[k + 1], ++ pa, pa); ++ + return 0; + } + +@@ -170,45 +151,16 @@ void machine_kexec_cleanup(struct kimage + NORET_TYPE void machine_kexec(struct kimage *image) + { + unsigned long page_list; +- unsigned long reboot_code_buffer; +- ++ unsigned long control_code; ++ unsigned long page_table_a; + relocate_new_kernel_t rnk; + +- /* Interrupts aren't acceptable while we reboot */ +- local_irq_disable(); +- +- /* Compute some offsets */ +- reboot_code_buffer = page_to_pfn(image->control_code_page) +- << PAGE_SHIFT; + page_list = image->head; +- +- /* Set up an identity mapping for the reboot_code_buffer */ +- identity_map_page(reboot_code_buffer); +- +- /* copy it out */ +- memcpy((void *)reboot_code_buffer, relocate_new_kernel, +- 
relocate_new_kernel_size); +- +- /* The segment registers are funny things, they are +- * automatically loaded from a table, in memory wherever you +- * set them to a specific selector, but this table is never +- * accessed again you set the segment to a different selector. +- * +- * The more common model is are caches where the behide +- * the scenes work is done, but is also dropped at arbitrary +- * times. +- * +- * I take advantage of this here by force loading the +- * segments, before I zap the gdt with an invalid value. +- */ +- load_segments(); +- /* The gdt & idt are now invalid. +- * If you want to load them you must set up your own idt & gdt. +- */ +- set_gdt(phys_to_virt(0),0); +- set_idt(phys_to_virt(0),0); ++ control_code = __pa(page_address(image->control_code_page)); ++ page_table_a = __pa(page_address(image->arch_data.page_table_a[0])); + + /* now call it */ +- rnk = (relocate_new_kernel_t) reboot_code_buffer; +- (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); ++ rnk = (relocate_new_kernel_t) relocate_new_kernel; ++ (*rnk)(page_list, control_code, image->start, ++ page_table_a, (unsigned long)cpu_has_pae); + } +--- x/arch/i386/kernel/relocate_kernel.S ++++ x/arch/i386/kernel/relocate_kernel.S +@@ -2,12 +2,20 @@ + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman + * ++ * 2006-05-19 Magnus Damm : ++ * - moved segment handling code from machine_kexec.c ++ * - gdt tables stolen from arch/i386/boot/setup.S ++ * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + #include ++#include + ++.text ++.align (1 << PAGE_SHIFT) ++ + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. 
+@@ -18,18 +26,68 @@ relocate_new_kernel: + movl 4(%esp), %ebx /* page_list */ + movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 12(%esp), %edx /* start address */ +- movl 16(%esp), %ecx /* cpu_has_pae */ ++ movl 16(%esp), %edi /* page_table_a */ ++ movl 20(%esp), %ecx /* cpu_has_pae */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + ++ /* switch to page_table_a */ ++ movl %edi, %eax ++ movl %eax, %cr3 ++ ++ /* setup idt */ ++ ++ movl %ebp, %eax ++ addl $(idt_48 - relocate_new_kernel), %eax ++ lidtl (%eax) ++ ++ /* setup gdt */ ++ ++ movl %ebp, %eax ++ addl $(gdt - relocate_new_kernel), %eax ++ movl %ebp, %esi ++ addl $((gdt_48 - relocate_new_kernel) + 2), %esi ++ movl %eax, (%esi) ++ ++ movl %ebp, %eax ++ addl $(gdt_48 - relocate_new_kernel), %eax ++ lgdtl (%eax) ++ ++ /* setup data segment registers */ ++ ++ mov $(gdt_ds - gdt), %eax ++ mov %eax, %ds ++ mov %eax, %es ++ mov %eax, %fs ++ mov %eax, %gs ++ mov %eax, %ss ++ + /* set a new stack at the bottom of our page... 
*/ + lea 4096(%ebp), %esp + ++ /* load new code segment */ ++ ++ movl %ebp, %esi ++ xorl %eax, %eax ++ pushl %eax ++ pushl %esi ++ pushl %eax ++ ++ movl $(gdt_cs - gdt), %eax ++ pushl %eax ++ ++ movl %ebp, %eax ++ addl $(identity_mapped - relocate_new_kernel),%eax ++ pushl %eax ++ iretl ++ ++identity_mapped: ++ + /* store the parameters back on the stack */ + pushl %edx /* store the start address */ +- ++ + /* Set cr0 to a known state: + * 31 0 == Paging disabled + * 18 0 == Alignment check disabled +@@ -113,6 +171,36 @@ relocate_new_kernel: + xorl %edi, %edi + xorl %ebp, %ebp + ret ++ ++ .align 16 ++gdt: ++ .fill 1,8,0 ++ ++gdt_cs: ++ .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) ++ .word 0 # base address = 0 ++ .word 0x9A00 # code read/exec ++ .word 0x00CF # granularity = 4096, 386 ++ # (+5th nibble of limit) ++gdt_ds: ++ .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) ++ .word 0 # base address = 0 ++ .word 0x9200 # data read/write ++ .word 0x00CF # granularity = 4096, 386 ++ # (+5th nibble of limit) ++gdt_end: ++ .align 4 ++ ++ .word 0 # alignment byte ++idt_48: ++ .word 0 # idt limit = 0 ++ .word 0, 0 # idt base = 0L ++ ++ .word 0 # alignment byte ++gdt_48: ++ .word gdt_end - gdt - 1 # gdt limit ++ .word 0, 0 # gdt base (filled in later) ++ + relocate_new_kernel_end: + + .globl relocate_new_kernel_size +--- x/include/asm-i386/kexec.h ++++ x/include/asm-i386/kexec.h +@@ -29,7 +29,17 @@ + + #define MAX_NOTE_BYTES 1024 + +-struct kimage_arch {}; ++struct kimage_arch { ++ /* page_table_a[] holds enough pages to create a new page table ++ * that maps the control page twice.. ++ */ ++ ++#if defined(CONFIG_X86_PAE) ++ struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */ ++#else ++ struct page *page_table_a[3]; /* (2 * pte) + pgd */ ++#endif ++}; + + /* CPU does not save ss and esp on stack if execution is already + * running in kernel mode at the time of NMI occurrence. 
This code --- /dev/null +++ x/patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_i386-xen.patch @@ -0,0 +1,59 @@ +kexec: xen specific portions of the page table a patch for kexec + +Signed-off-by: Magnus Damm + + arch/i386/kernel/machine_kexec.c | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +--- x/arch/i386/kernel/machine_kexec.c 2006-07-12 13:16:20.000000000 +0900 ++++ x/arch/i386/kernel/machine_kexec.c 2006-07-12 13:16:38.000000000 +0900 +@@ -23,15 +23,23 @@ + #include + #include + ++#ifdef CONFIG_XEN ++#include ++#endif ++ ++#ifndef CONFIG_XEN + typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long reboot_code_buffer, + unsigned long start_address, + unsigned long page_table_a, + unsigned long has_pae) ATTRIB_NORET; ++#endif + + const extern unsigned char relocate_new_kernel[]; ++#ifndef CONFIG_XEN + extern void relocate_new_kernel_end(void); ++#endif + const extern unsigned int relocate_new_kernel_size; + + static int allocate_page_table_a(struct kimage *image) +@@ -144,6 +152,7 @@ + { + } + ++#ifndef CONFIG_XEN + /* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. +@@ -164,3 +173,17 @@ + (*rnk)(page_list, control_code, image->start, + page_table_a, (unsigned long)cpu_has_pae); + } ++#endif ++ ++#ifdef CONFIG_XEN ++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) ++{ ++ struct kimage_arch *arch = &image->arch_data; ++ int k, n = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]); ++ ++ for (k = 0; k < n; k++) ++ xki->page_table_a[k] = ++ pfn_to_mfn(page_to_pfn(arch->page_table_a[k])) ++ << PAGE_SHIFT; ++} ++#endif