WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 04/04] Kexec / Kdump: x86_64 specific code

To: Keir Fraser <Keir.Fraser@xxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH 04/04] Kexec / Kdump: x86_64 specific code
From: Magnus Damm <magnus@xxxxxxxxxxxxx>
Date: Mon, 23 Oct 2006 18:05:49 +0900
Cc: Ian Pratt <m+Ian.Pratt@xxxxxxxxxxxx>, Kazuo Moriwaka <moriwaka@xxxxxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxx, Akio Takebe <takebe_akio@xxxxxxxxxxxxxx>, magnus.damm@xxxxxxxxx, Isaku Yamahata <yamahata@xxxxxxxxxxxxx>, Magnus Damm <magnus@xxxxxxxxxxxxx>, Horms <horms@xxxxxxxxxxxx>
Delivery-date: Tue, 24 Oct 2006 08:10:29 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
In-reply-to: <20061023090515.26706.69407.sendpatchset@localhost>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <20061023090515.26706.69407.sendpatchset@localhost>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
[PATCH 04/04] Kexec / Kdump: x86_64 specific code

This patch contains the x86_64 implementation of Kexec / Kdump for Xen.

Signed-Off-By: Magnus Damm <magnus@xxxxxxxxxxxxx>
---

 Applies on top of xen-unstable-11856.

 buildconfigs/linux-defconfig_xen_x86_64                           |    1
 linux-2.6-xen-sparse/arch/x86_64/Kconfig                          |    2
 linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile                  |    2
 linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c               |   27
 linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h               |   64 +
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h  |    7
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h     |    2
 patches/linux-2.6.16.29/git-4b...1f.patch                         |  375 ++++
 patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec..code-x86_64.patch |  161 ++
 patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-x86_64.patch   |  162 ++
 patches/linux-2.6.16.29/series                                    |    3
 xen/arch/x86/x86_64/entry.S                                       |    2
 xen/include/asm-x86/x86_64/elf.h                                  |   49 +
 xen/include/asm-x86/x86_64/kexec.h                                |   60 +
 14 files changed, 903 insertions(+), 14 deletions(-)

--- 0002/buildconfigs/linux-defconfig_xen_x86_64
+++ work/buildconfigs/linux-defconfig_xen_x86_64        2006-10-23 11:36:17.000000000 +0900
@@ -138,6 +138,7 @@ CONFIG_SWIOTLB=y
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
--- 0001/linux-2.6-xen-sparse/arch/x86_64/Kconfig
+++ work/linux-2.6-xen-sparse/arch/x86_64/Kconfig       2006-10-23 11:36:17.000000000 +0900
@@ -435,7 +435,7 @@ config X86_MCE_AMD
 
 config KEXEC
        bool "kexec system call (EXPERIMENTAL)"
-       depends on EXPERIMENTAL && !X86_64_XEN
+       depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
--- 0001/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
+++ work/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile       2006-10-23 11:36:17.000000000 +0900
@@ -59,7 +59,7 @@ pci-dma-y                     += ../../i386/kernel/pci-dma
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
 quirks-y                       := ../../i386/kernel/quirks-xen.o
 
-n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
+n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o crash.o
 
 include $(srctree)/scripts/Makefile.xen
 
--- 0001/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
+++ work/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c    2006-10-23 11:36:17.000000000 +0900
@@ -80,6 +80,10 @@
 #include <asm/mach-xen/setup_arch_post.h>
 #include <xen/interface/memory.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 extern unsigned long start_pfn;
 extern struct edid_info edid_info;
 
@@ -450,6 +454,7 @@ static __init void parse_cmdline_early (
                 * after a kernel panic.
                 */
                else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
                        unsigned long size, base;
                        size = memparse(from+12, &from);
                        if (*from == '@') {
@@ -460,6 +465,10 @@ static __init void parse_cmdline_early (
                                crashk_res.start = base;
                                crashk_res.end   = base + size - 1;
                        }
+#else
+                       printk("Ignoring crashkernel command line, "
+                              "parameter will be supplied by xen\n");
+#endif
                }
 #endif
 
@@ -812,10 +821,23 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif /* !CONFIG_XEN */
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
        if (crashk_res.start != crashk_res.end) {
                reserve_bootmem(crashk_res.start,
                        crashk_res.end - crashk_res.start + 1);
        }
+#else
+       {
+               xen_kexec_reserve_t reservation;
+               BUG_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_reserve,
+                                       &reservation));
+               if (reservation.size) {
+                       crashk_res.start = reservation.start;
+                       crashk_res.end = reservation.start + 
+                               reservation.size - 1;
+               }
+       }
+#endif
 #endif
 
        paging_init();
@@ -954,6 +976,11 @@ void __init setup_arch(char **cmdline_p)
        iommu_hole_init();
 #endif
 
+#ifdef CONFIG_KEXEC
+       if (crashk_res.start != crashk_res.end)
+               request_resource(&ioport_resource, &crashk_res);
+#endif
+
 #ifdef CONFIG_XEN
        {
                struct physdev_set_iopl set_iopl;
--- /dev/null
+++ work/linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h    2006-10-23 11:36:18.000000000 +0900
@@ -0,0 +1,64 @@
+/*
+ * include/asm-x86_64/kexec-xen.h
+ *
+ * Created By: Horms <horms@xxxxxxxxxxxx>
+ */
+
+#ifndef _X86_64_KEXEC_XEN_H
+#define _X86_64_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_64.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+                                       struct cpu_user_regs *xen_regs)
+{
+       xen_regs->r15 = linux_regs->r15;
+       xen_regs->r14 = linux_regs->r14;
+       xen_regs->r13 = linux_regs->r13;
+       xen_regs->r12 = linux_regs->r12;
+       xen_regs->rbp = linux_regs->rbp;
+       xen_regs->rbx = linux_regs->rbx;
+       xen_regs->r11 = linux_regs->r11;
+       xen_regs->r10 = linux_regs->r10;
+       xen_regs->r9 = linux_regs->r9;
+       xen_regs->r8 = linux_regs->r8;
+       xen_regs->rax = linux_regs->rax;
+       xen_regs->rcx = linux_regs->rcx;
+       xen_regs->rdx = linux_regs->rdx;
+       xen_regs->rsi = linux_regs->rsi;
+       xen_regs->rdi = linux_regs->rdi;
+       xen_regs->rip = linux_regs->rip;
+       xen_regs->cs = linux_regs->cs;
+       xen_regs->rflags = linux_regs->eflags;
+       xen_regs->rsp = linux_regs->rsp;
+       xen_regs->ss = linux_regs->ss;
+}
+
+/* Kexec needs to know about the actual physical address.
+ * But in xen, on some architectures, a physical address is a
+ * pseudo-physical address. */
+#ifdef CONFIG_XEN
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#else
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
+#endif /* _X86_64_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- 0001/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
+++ work/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h       2006-10-23 11:36:17.000000000 +0900
@@ -386,4 +386,11 @@ HYPERVISOR_xenoprof_op(
        return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec_op(
+       unsigned long op, void *args)
+{
+       return _hypercall2(int, kexec_op, op, args);
+}
+
 #endif /* __HYPERCALL_H__ */
--- 0001/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
+++ work/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h  2006-10-23 11:36:17.000000000 +0900
@@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct p
 #define profile_pc(regs) instruction_pointer(regs)
 #endif
 
+#include <linux/compiler.h>
+
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
 struct task_struct;
--- /dev/null
+++ work/patches/linux-2.6.16.29/git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch     2006-10-23 11:36:18.000000000 +0900
@@ -0,0 +1,375 @@
+From: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Date: Tue, 26 Sep 2006 08:52:38 +0000 (+0200)
+Subject: [PATCH] Avoid overwriting the current pgd (V4, x86_64)
+X-Git-Tag: v2.6.19-rc1
+X-Git-Url: http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f
+
+[PATCH] Avoid overwriting the current pgd (V4, x86_64)
+
+kexec: Avoid overwriting the current pgd (V4, x86_64)
+
+This patch upgrades the x86_64-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables. These tables are used to provide
+an executable identity mapping without overwriting the current pgd.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Signed-off-by: Andi Kleen <ak@xxxxxxx>
+---
+
+--- a/arch/x86_64/kernel/machine_kexec.c
++++ b/arch/x86_64/kernel/machine_kexec.c
+@@ -15,6 +15,15 @@
+ #include <asm/mmu_context.h>
+ #include <asm/io.h>
+ 
++#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
++static u64 kexec_pgd[512] PAGE_ALIGNED;
++static u64 kexec_pud0[512] PAGE_ALIGNED;
++static u64 kexec_pmd0[512] PAGE_ALIGNED;
++static u64 kexec_pte0[512] PAGE_ALIGNED;
++static u64 kexec_pud1[512] PAGE_ALIGNED;
++static u64 kexec_pmd1[512] PAGE_ALIGNED;
++static u64 kexec_pte1[512] PAGE_ALIGNED;
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+       unsigned long end_addr;
+@@ -144,32 +153,19 @@ static void load_segments(void)
+               );
+ }
+ 
+-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+-                                      unsigned long control_code_buffer,
+-                                      unsigned long start_address,
+-                                      unsigned long pgtable) ATTRIB_NORET;
+-
+-extern const unsigned char relocate_new_kernel[];
+-extern const unsigned long relocate_new_kernel_size;
+-
+ int machine_kexec_prepare(struct kimage *image)
+ {
+-      unsigned long start_pgtable, control_code_buffer;
++      unsigned long start_pgtable;
+       int result;
+ 
+       /* Calculate the offsets */
+       start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-      control_code_buffer = start_pgtable + PAGE_SIZE;
+ 
+       /* Setup the identity mapped 64bit page table */
+       result = init_pgtable(image, start_pgtable);
+       if (result)
+               return result;
+ 
+-      /* Place the code in the reboot code buffer */
+-      memcpy(__va(control_code_buffer), relocate_new_kernel,
+-                                              relocate_new_kernel_size);
+-
+       return 0;
+ }
+ 
+@@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage
+  */
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+-      unsigned long page_list;
+-      unsigned long control_code_buffer;
+-      unsigned long start_pgtable;
+-      relocate_new_kernel_t rnk;
++      unsigned long page_list[PAGES_NR];
++      void *control_page;
+ 
+       /* Interrupts aren't acceptable while we reboot */
+       local_irq_disable();
+ 
+-      /* Calculate the offsets */
+-      page_list = image->head;
+-      start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-      control_code_buffer = start_pgtable + PAGE_SIZE;
++      control_page = page_address(image->control_code_page) + PAGE_SIZE;
++      memcpy(control_page, relocate_kernel, PAGE_SIZE);
+ 
+-      /* Set the low half of the page table to my identity mapped
+-       * page table for kexec.  Leave the high half pointing at the
+-       * kernel pages.   Don't bother to flush the global pages
+-       * as that will happen when I fully switch to my identity mapped
+-       * page table anyway.
+-       */
+-      memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+-      __flush_tlb();
++      page_list[PA_CONTROL_PAGE] = __pa(control_page);
++      page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
++      page_list[PA_PGD] = __pa(kexec_pgd);
++      page_list[VA_PGD] = (unsigned long)kexec_pgd;
++      page_list[PA_PUD_0] = __pa(kexec_pud0);
++      page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
++      page_list[PA_PMD_0] = __pa(kexec_pmd0);
++      page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
++      page_list[PA_PTE_0] = __pa(kexec_pte0);
++      page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
++      page_list[PA_PUD_1] = __pa(kexec_pud1);
++      page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
++      page_list[PA_PMD_1] = __pa(kexec_pmd1);
++      page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
++      page_list[PA_PTE_1] = __pa(kexec_pte1);
++      page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ 
++      page_list[PA_TABLE_PAGE] =
++        (unsigned long)__pa(page_address(image->control_code_page));
+ 
+       /* The segment registers are funny things, they have both a
+        * visible and an invisible part.  Whenever the visible part is
+@@ -222,9 +224,10 @@ NORET_TYPE void machine_kexec(struct kim
+        */
+       set_gdt(phys_to_virt(0),0);
+       set_idt(phys_to_virt(0),0);
++
+       /* now call it */
+-      rnk = (relocate_new_kernel_t) control_code_buffer;
+-      (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
++      relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
++                      image->start);
+ }
+ 
+ /* crashkernel=size@addr specifies the location to reserve for
+--- a/arch/x86_64/kernel/relocate_kernel.S
++++ b/arch/x86_64/kernel/relocate_kernel.S
+@@ -7,31 +7,169 @@
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
++#include <asm/kexec.h>
+ 
+-      /*
+-       * Must be relocatable PIC code callable as a C function, that once
+-       * it starts can not use the previous processes stack.
+-       */
+-      .globl relocate_new_kernel
++/*
++ * Must be relocatable PIC code callable as a C function
++ */
++
++#define PTR(x) (x << 3)
++#define PAGE_ALIGNED (1 << PAGE_SHIFT)
++#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
++
++      .text
++      .align PAGE_ALIGNED
+       .code64
++      .globl relocate_kernel
++relocate_kernel:
++      /* %rdi indirection_page
++       * %rsi page_list
++       * %rdx start address
++       */
++
++      /* map the control page at its virtual address */
++
++      movq    $0x0000ff8000000000, %r10        /* mask */
++      mov     $(39 - 3), %cl                   /* bits to shift */
++      movq    PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PGD)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PUD_0)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PUD_0)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PMD_0)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PMD_0)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PTE_0)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PTE_0)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      /* identity map the control page at its physical address */
++
++      movq    $0x0000ff8000000000, %r10        /* mask */
++      mov     $(39 - 3), %cl                   /* bits to shift */
++      movq    PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PGD)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PUD_1)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PUD_1)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PMD_1)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PMD_1)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_PTE_1)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
++      shrq    $9, %r10
++      sub     $9, %cl
++
++      movq    %r11, %r9
++      andq    %r10, %r9
++      shrq    %cl, %r9
++
++      movq    PTR(VA_PTE_1)(%rsi), %r8
++      addq    %r8, %r9
++      movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
++      orq     $PAGE_ATTR, %r8
++      movq    %r8, (%r9)
++
+ relocate_new_kernel:
+-      /* %rdi page_list
+-       * %rsi reboot_code_buffer
++      /* %rdi indirection_page
++       * %rsi page_list
+        * %rdx start address
+-       * %rcx page_table
+-       * %r8  arg5
+-       * %r9  arg6
+        */
+ 
+       /* zero out flags, and disable interrupts */
+       pushq $0
+       popfq
+ 
+-      /* set a new stack at the bottom of our page... */
+-      lea   4096(%rsi), %rsp
++      /* get physical address of control page now */
++      /* this is impossible after page table switch */
++      movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
++
++      /* get physical address of page table now too */
++      movq    PTR(PA_TABLE_PAGE)(%rsi), %rcx
++
++      /* switch to new set of page tables */
++      movq    PTR(PA_PGD)(%rsi), %r9
++      movq    %r9, %cr3
++
++      /* setup a new stack at the end of the physical control page */
++      lea     4096(%r8), %rsp
++
++      /* jump to identity mapped page */
++      addq    $(identity_mapped - relocate_kernel), %r8
++      pushq   %r8
++      ret
+ 
+-      /* store the parameters back on the stack */
+-      pushq   %rdx /* store the start address */
++identity_mapped:
++      /* store the start address on the stack */
++      pushq   %rdx
+ 
+       /* Set cr0 to a known state:
+        * 31 1 == Paging enabled
+@@ -136,8 +274,3 @@ relocate_new_kernel:
+       xorq    %r15, %r15
+ 
+       ret
+-relocate_new_kernel_end:
+-
+-      .globl relocate_new_kernel_size
+-relocate_new_kernel_size:
+-      .quad relocate_new_kernel_end - relocate_new_kernel
+--- a/include/asm-x86_64/kexec.h
++++ b/include/asm-x86_64/kexec.h
+@@ -1,6 +1,27 @@
+ #ifndef _X86_64_KEXEC_H
+ #define _X86_64_KEXEC_H
+ 
++#define PA_CONTROL_PAGE  0
++#define VA_CONTROL_PAGE  1
++#define PA_PGD           2
++#define VA_PGD           3
++#define PA_PUD_0         4
++#define VA_PUD_0         5
++#define PA_PMD_0         6
++#define VA_PMD_0         7
++#define PA_PTE_0         8
++#define VA_PTE_0         9
++#define PA_PUD_1         10
++#define VA_PUD_1         11
++#define PA_PMD_1         12
++#define VA_PMD_1         13
++#define PA_PTE_1         14
++#define VA_PTE_1         15
++#define PA_TABLE_PAGE    16
++#define PAGES_NR         17
++
++#ifndef __ASSEMBLY__
++
+ #include <linux/string.h>
+ 
+ #include <asm/page.h>
+@@ -64,4 +85,12 @@ static inline void crash_setup_regs(stru
+               newregs->rip = (unsigned long)current_text_addr();
+       }
+ }
++
++NORET_TYPE void
++relocate_kernel(unsigned long indirection_page,
++              unsigned long page_list,
++              unsigned long start_address) ATTRIB_NORET;
++
++#endif /* __ASSEMBLY__ */
++
+ #endif /* _X86_64_KEXEC_H */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch  2006-10-23 11:36:18.000000000 +0900
@@ -0,0 +1,161 @@
+kexec: Move asm segment handling code to the assembly file (x86_64)
+
+This patch moves the idt, gdt, and segment handling code from machine_kexec.c
+to relocate_kernel.S.  The main reason behind this move is to avoid code 
+duplication in the Xen hypervisor. With this patch all code required to kexec
+is put on the control page.
+
+On top of that this patch also counts as a cleanup - I think it is much
+nicer to write assembly directly in assembly files than wrap inline assembly
+in C functions for no apparent reason.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+---
+
+ Applies to 2.6.19-rc1.
+
+ machine_kexec.c   |   58 -----------------------------------------------------
+ relocate_kernel.S |   50 +++++++++++++++++++++++++++++++++++++++++----
+ 2 files changed, 45 insertions(+), 63 deletions(-)
+
+--- 0002/arch/x86_64/kernel/machine_kexec.c
++++ work/arch/x86_64/kernel/machine_kexec.c    2006-10-05 16:15:49.000000000 +0900
+@@ -112,47 +112,6 @@ static int init_pgtable(struct kimage *i
+       return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+ }
+ 
+-static void set_idt(void *newidt, u16 limit)
+-{
+-      struct desc_ptr curidt;
+-
+-      /* x86-64 supports unaliged loads & stores */
+-      curidt.size    = limit;
+-      curidt.address = (unsigned long)newidt;
+-
+-      __asm__ __volatile__ (
+-              "lidtq %0\n"
+-              : : "m" (curidt)
+-              );
+-};
+-
+-
+-static void set_gdt(void *newgdt, u16 limit)
+-{
+-      struct desc_ptr curgdt;
+-
+-      /* x86-64 supports unaligned loads & stores */
+-      curgdt.size    = limit;
+-      curgdt.address = (unsigned long)newgdt;
+-
+-      __asm__ __volatile__ (
+-              "lgdtq %0\n"
+-              : : "m" (curgdt)
+-              );
+-};
+-
+-static void load_segments(void)
+-{
+-      __asm__ __volatile__ (
+-              "\tmovl %0,%%ds\n"
+-              "\tmovl %0,%%es\n"
+-              "\tmovl %0,%%ss\n"
+-              "\tmovl %0,%%fs\n"
+-              "\tmovl %0,%%gs\n"
+-              : : "a" (__KERNEL_DS) : "memory"
+-              );
+-}
+-
+ int machine_kexec_prepare(struct kimage *image)
+ {
+       unsigned long start_pgtable;
+@@ -209,23 +168,6 @@ NORET_TYPE void machine_kexec(struct kim
+       page_list[PA_TABLE_PAGE] =
+         (unsigned long)__pa(page_address(image->control_code_page));
+ 
+-      /* The segment registers are funny things, they have both a
+-       * visible and an invisible part.  Whenever the visible part is
+-       * set to a specific selector, the invisible part is loaded
+-       * with from a table in memory.  At no other time is the
+-       * descriptor table in memory accessed.
+-       *
+-       * I take advantage of this here by force loading the
+-       * segments, before I zap the gdt with an invalid value.
+-       */
+-      load_segments();
+-      /* The gdt & idt are now invalid.
+-       * If you want to load them you must set up your own idt & gdt.
+-       */
+-      set_gdt(phys_to_virt(0),0);
+-      set_idt(phys_to_virt(0),0);
+-
+-      /* now call it */
+       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+                       image->start);
+ }
+--- 0002/arch/x86_64/kernel/relocate_kernel.S
++++ work/arch/x86_64/kernel/relocate_kernel.S  2006-10-05 16:18:07.000000000 +0900
+@@ -159,13 +159,39 @@ relocate_new_kernel:
+       movq    PTR(PA_PGD)(%rsi), %r9
+       movq    %r9, %cr3
+ 
++      /* setup idt */
++      movq    %r8, %rax
++      addq    $(idt_80 - relocate_kernel), %rax
++      lidtq   (%rax)
++
++      /* setup gdt */
++      movq    %r8, %rax
++      addq    $(gdt - relocate_kernel), %rax
++      movq    %r8, %r9
++      addq    $((gdt_80 - relocate_kernel) + 2), %r9
++      movq    %rax, (%r9)
++
++      movq    %r8, %rax
++      addq    $(gdt_80 - relocate_kernel), %rax
++      lgdtq   (%rax)
++
++      /* setup data segment registers */
++      xorl    %eax, %eax
++      movl    %eax, %ds
++      movl    %eax, %es
++      movl    %eax, %fs
++      movl    %eax, %gs
++      movl    %eax, %ss
++      
+       /* setup a new stack at the end of the physical control page */
+       lea     4096(%r8), %rsp
+ 
+-      /* jump to identity mapped page */
+-      addq    $(identity_mapped - relocate_kernel), %r8
+-      pushq   %r8
+-      ret
++      /* load new code segment and jump to identity mapped page */
++      movq    %r8, %rax
++      addq    $(identity_mapped - relocate_kernel), %rax
++      pushq   $(gdt_cs - gdt)
++      pushq   %rax
++      lretq
+ 
+ identity_mapped:
+       /* store the start address on the stack */
+@@ -272,5 +298,19 @@ identity_mapped:
+       xorq    %r13, %r13
+       xorq    %r14, %r14
+       xorq    %r15, %r15
+-
+       ret
++
++      .align  16
++gdt:
++      .quad   0x0000000000000000      /* NULL descriptor */
++gdt_cs:
++      .quad   0x00af9a000000ffff
++gdt_end:
++
++gdt_80:
++      .word   gdt_end - gdt - 1       /* limit */
++      .quad   0                       /* base - filled in by code above */
++
++idt_80:
++      .word   0                       /* limit */
++      .quad   0                       /* base */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-x86_64.patch        2006-10-23 11:36:18.000000000 +0900
@@ -0,0 +1,162 @@
+--- 0006/arch/x86_64/kernel/machine_kexec.c
++++ work/arch/x86_64/kernel/machine_kexec.c    2006-10-06 15:36:16.000000000 +0900
+@@ -24,6 +24,104 @@ static u64 kexec_pud1[512] PAGE_ALIGNED;
+ static u64 kexec_pmd1[512] PAGE_ALIGNED;
+ static u64 kexec_pte1[512] PAGE_ALIGNED;
+ 
++#ifdef CONFIG_XEN
++
++/* In the case of Xen, override hypervisor functions to be able to create
++ * a regular identity mapping page table...
++ */
++
++#include <xen/interface/kexec.h>
++#include <xen/interface/memory.h>
++
++#define x__pmd(x) ((pmd_t) { (x) } )
++#define x__pud(x) ((pud_t) { (x) } )
++#define x__pgd(x) ((pgd_t) { (x) } )
++
++#define x_pmd_val(x)   ((x).pmd)
++#define x_pud_val(x)   ((x).pud)
++#define x_pgd_val(x)   ((x).pgd)
++
++static inline void x_set_pmd(pmd_t *dst, pmd_t val)
++{
++      x_pmd_val(*dst) = x_pmd_val(val);
++}
++
++static inline void x_set_pud(pud_t *dst, pud_t val)
++{
++      x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
++}
++
++static inline void x_pud_clear (pud_t *pud)
++{
++      x_pud_val(*pud) = 0;
++}
++
++static inline void x_set_pgd(pgd_t *dst, pgd_t val)
++{
++      x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
++}
++
++static inline void x_pgd_clear (pgd_t * pgd)
++{
++      x_pgd_val(*pgd) = 0;
++}
++
++#define X__PAGE_KERNEL_LARGE_EXEC \
++         _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE
++#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY
++
++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
++
++#if PAGES_NR > KEXEC_XEN_NO_PAGES
++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
++#endif
++
++#if PA_CONTROL_PAGE != 0
++#error PA_CONTROL_PAGE is non zero - Xen support will break
++#endif
++
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++      void *control_page;
++      void *table_page;
++
++      memset(xki->page_list, 0, sizeof(xki->page_list));
++
++      control_page = page_address(image->control_code_page) + PAGE_SIZE;
++      memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++      table_page = page_address(image->control_code_page);
++
++      xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
++      xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
++
++      xki->page_list[PA_PGD] = __ma(kexec_pgd);
++      xki->page_list[PA_PUD_0] = __ma(kexec_pud0);
++      xki->page_list[PA_PUD_1] = __ma(kexec_pud1);
++      xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
++      xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
++      xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
++      xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
++}
++
++#else /* CONFIG_XEN */
++
++#define x__pmd(x) __pmd(x)
++#define x__pud(x) __pud(x)
++#define x__pgd(x) __pgd(x)
++
++#define x_set_pmd(x, y) set_pmd(x, y)
++#define x_set_pud(x, y) set_pud(x, y)
++#define x_set_pgd(x, y) set_pgd(x, y)
++
++#define x_pud_clear(x) pud_clear(x)
++#define x_pgd_clear(x) pgd_clear(x)
++
++#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
++#define X_KERNPG_TABLE _KERNPG_TABLE
++
++#endif /* CONFIG_XEN */
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+       unsigned long end_addr;
+@@ -31,7 +129,7 @@ static void init_level2_page(pmd_t *leve
+       addr &= PAGE_MASK;
+       end_addr = addr + PUD_SIZE;
+       while (addr < end_addr) {
+-              set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++              x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
+               addr += PMD_SIZE;
+       }
+ }
+@@ -56,12 +154,12 @@ static int init_level3_page(struct kimag
+               }
+               level2p = (pmd_t *)page_address(page);
+               init_level2_page(level2p, addr);
+-              set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
++              x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
+               addr += PUD_SIZE;
+       }
+       /* clear the unused entries */
+       while (addr < end_addr) {
+-              pud_clear(level3p++);
++              x_pud_clear(level3p++);
+               addr += PUD_SIZE;
+       }
+ out:
+@@ -92,12 +190,12 @@ static int init_level4_page(struct kimag
+               if (result) {
+                       goto out;
+               }
+-              set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
++              x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
+               addr += PGDIR_SIZE;
+       }
+       /* clear the unused entries */
+       while (addr < end_addr) {
+-              pgd_clear(level4p++);
++              x_pgd_clear(level4p++);
+               addr += PGDIR_SIZE;
+       }
+ out:
+@@ -108,8 +206,14 @@ out:
+ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+ {
+       pgd_t *level4p;
++      unsigned long x_end_pfn = end_pfn;
++
++#ifdef CONFIG_XEN
++      x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++#endif
++
+       level4p = (pgd_t *)__va(start_pgtable);
+-      return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
++      return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT);
+ }
+ 
+ int machine_kexec_prepare(struct kimage *image)
--- 0005/patches/linux-2.6.16.29/series
+++ work/patches/linux-2.6.16.29/series 2006-10-23 11:36:17.000000000 +0900
@@ -4,6 +4,9 @@ git-2a8a3d5b65e86ec1dfef7d268c64a909eab9
 git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch
 linux-2.6.19-rc1-kexec-move_segment_code-i386.patch
 linux-2.6.19-rc1-kexec-xen-i386.patch
+git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch
+linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch
+linux-2.6.19-rc1-kexec-xen-x86_64.patch
 blktap-aio-16_03_06.patch
 device_bind.patch
 fix-hz-suspend.patch
--- 0001/xen/arch/x86/x86_64/entry.S
+++ work/xen/arch/x86/x86_64/entry.S    2006-10-23 11:36:17.000000000 +0900
@@ -573,6 +573,7 @@ ENTRY(hypercall_table)
         .quad do_hvm_op
         .quad do_sysctl             /* 35 */
         .quad do_domctl
+        .quad do_kexec_op
         .rept NR_hypercalls-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -615,6 +616,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_hvm_op            */
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
+        .byte 2 /* do_kexec_op          */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- 0004/xen/include/asm-x86/x86_64/elf.h
+++ work/xen/include/asm-x86/x86_64/elf.h       2006-10-23 11:36:17.000000000 +0900
@@ -1,14 +1,51 @@
+/*
+ * Based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS from Linux 2.6.16
+ */
+
 #ifndef __X86_64_ELF_H__
 #define __X86_64_ELF_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#define ELF_NGREG 27
 
-#define ELF_NGREG 1       /* XXX: Define to be at least as large as
-                             however many register slots are needed when
-                             crash notes are written during crash dump */
+/* XXX: Xen doesn't have orig_rax, so it is omitted.
+ *      Xen doesn't have threads, so fs and gs are read from the CPU and
+ *      thus values 21 and 22 are just duplicates of 25 and 26
+ *      respectively.  All these values could be passed from dom0 in the
+ *      case of it crashing, but does that help?
+ *
+ *      Lastly, I'm not sure why ds, es, fs and gs are read from
+ *      the CPU rather than regs, but linux does this
+ */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#define ELF_CORE_COPY_REGS(pr_reg, regs)  do { \
+       unsigned v; /* scratch for the segment-register reads below */ \
+       (pr_reg)[0] = (regs)->r15;                              \
+       (pr_reg)[1] = (regs)->r14;                              \
+       (pr_reg)[2] = (regs)->r13;                              \
+       (pr_reg)[3] = (regs)->r12;                              \
+       (pr_reg)[4] = (regs)->rbp;                              \
+       (pr_reg)[5] = (regs)->rbx;                              \
+       (pr_reg)[6] = (regs)->r11;                              \
+       (pr_reg)[7] = (regs)->r10;                              \
+       (pr_reg)[8] = (regs)->r9;                               \
+       (pr_reg)[9] = (regs)->r8;                               \
+       (pr_reg)[10] = (regs)->rax;                             \
+       (pr_reg)[11] = (regs)->rcx;                             \
+       (pr_reg)[12] = (regs)->rdx;                             \
+       (pr_reg)[13] = (regs)->rsi;                             \
+       (pr_reg)[14] = (regs)->rdi; /* slot 15 (orig_rax) is not written */ \
+       (pr_reg)[16] = (regs)->rip;                     \
+       (pr_reg)[17] = (regs)->cs;                      \
+       (pr_reg)[18] = (regs)->eflags;                  \
+       (pr_reg)[19] = (regs)->rsp;                     \
+       (pr_reg)[20] = (regs)->ss;                      \
+       asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[21] = v;       \
+       asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[22] = v;       \
+       asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;       \
+       asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;       \
+       asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;       \
+       asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v;       \
+} while (0) /* no trailing ';': the call site supplies it */
 
 #endif /* __X86_64_ELF_H__ */
 
--- 0004/xen/include/asm-x86/x86_64/kexec.h
+++ work/xen/include/asm-x86/x86_64/kexec.h     2006-10-23 11:36:17.000000000 +0900
@@ -1,20 +1,68 @@
+/******************************************************************************
+ * kexec.h
+ * 
+ * Based heavily on machine_kexec.c and kexec.h from Linux 2.6.19-rc1
+ *
+ */
+
 #ifndef __X86_64_KEXEC_H__
 #define __X86_64_KEXEC_H__
-
-#include <xen/lib.h>       /* for printk() used in stub */
+  
+#include <xen/lib.h>
 #include <xen/types.h>
 #include <public/xen.h>
 #include <xen/kexec.h>
-
+#include <asm/processor.h>
+#include <xen/string.h>
+#include <asm/fixmap.h>
+  
+/*
+ * Saving the registers of the cpu on which panic occurred in
+ * crash_kexec to save a valid sp. The registers of other cpus
+ * will be saved in machine_crash_shutdown while shooting them down.
+ */
 static inline void crash_setup_regs(struct cpu_user_regs *newregs,
-                                    struct cpu_user_regs *oldregs)
+                            struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+       if (oldregs) /* a register frame was supplied: copy it verbatim */
+               memcpy(newregs, oldregs, sizeof(*newregs));
+       else { /* no frame supplied: read the live CPU registers instead */
+               __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
+               __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
+               __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
+               __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
+               __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
+               __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
+               __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
+               __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
+               __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
+               __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
+               __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
+               __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
+               __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
+               __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
+               __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
+               __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
+               __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
+               __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
+               __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
+
+               newregs->rip = (unsigned long)current_text_addr();
+       }
 }
 
+typedef void (*relocate_new_kernel_t)(
+                unsigned long indirection_page,
+                unsigned long page_list,
+                unsigned long start_address);
+
 static inline void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    relocate_new_kernel_t rnk; /* routine to call, taken from page_list[1] */
+
+    rnk = (relocate_new_kernel_t) image->page_list[1];
+    (*rnk)(image->indirection_page, (unsigned long)image->page_list, 
+           image->start_address);
 }
 
 #endif /* __X86_64_KEXEC_H__ */

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel