WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 03/04] Kexec / Kdump: x86_32 specific code

To: Keir Fraser <Keir.Fraser@xxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH 03/04] Kexec / Kdump: x86_32 specific code
From: Magnus Damm <magnus@xxxxxxxxxxxxx>
Date: Mon, 23 Oct 2006 18:05:40 +0900
Cc: Ian Pratt <m+Ian.Pratt@xxxxxxxxxxxx>, Kazuo Moriwaka <moriwaka@xxxxxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxx, Akio Takebe <takebe_akio@xxxxxxxxxxxxxx>, magnus.damm@xxxxxxxxx, Isaku Yamahata <yamahata@xxxxxxxxxxxxx>, Magnus Damm <magnus@xxxxxxxxxxxxx>, Horms <horms@xxxxxxxxxxxx>
Delivery-date: Tue, 24 Oct 2006 08:09:51 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
In-reply-to: <20061023090515.26706.69407.sendpatchset@localhost>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <20061023090515.26706.69407.sendpatchset@localhost>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
[PATCH 03/04] Kexec / Kdump: x86_32 specific code

This patch contains the x86_32 implementation of Kexec / Kdump for Xen.

Signed-Off-By: Magnus Damm <magnus@xxxxxxxxxxxxx>
---

 Applies on top of xen-unstable-11856.

 buildconfigs/linux-defconfig_xen_x86_32                         |   2
 linux-2.6-xen-sparse/arch/i386/Kconfig                          |   2
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                  |   2
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c               |  25
 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h               |  51 +
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h  |   8
 patches/linux-2.6.16.29/git-35...c9.patch                       | 401 +++++++
 patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec..code-i386.patch | 169 +++
 patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-i386.patch   |  54 +
 patches/linux-2.6.16.29/series                                  |   3
 xen/arch/x86/x86_32/entry.S                                     |   2
 xen/include/asm-x86/x86_32/elf.h                                |  37
 xen/include/asm-x86/x86_32/kexec.h                              |  84 +-
 13 files changed, 817 insertions(+), 23 deletions(-)

--- 0002/buildconfigs/linux-defconfig_xen_x86_32
+++ work/buildconfigs/linux-defconfig_xen_x86_32        2006-10-23 
11:36:16.000000000 +0900
@@ -183,6 +183,7 @@ CONFIG_MTRR=y
 CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
@@ -1036,6 +1037,7 @@ CONFIG_DNOTIFY=y
 #
 CONFIG_PROC_FS=y
 CONFIG_PROC_KCORE=y
+# CONFIG_PROC_VMCORE is not set
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 # CONFIG_HUGETLB_PAGE is not set
--- 0001/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ work/linux-2.6-xen-sparse/arch/i386/Kconfig 2006-10-23 11:36:16.000000000 
+0900
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
        bool "kexec system call (EXPERIMENTAL)"
-       depends on EXPERIMENTAL && !X86_XEN
+       depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
--- 0001/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ work/linux-2.6-xen-sparse/arch/i386/kernel/Makefile 2006-10-23 
11:36:16.000000000 +0900
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- 0001/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ work/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c      2006-10-23 
11:36:16.000000000 +0900
@@ -69,6 +69,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -943,6 +947,7 @@ static void __init parse_cmdline_early (
                 * after a kernel panic.
                 */
                else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
                        unsigned long size, base;
                        size = memparse(from+12, &from);
                        if (*from == '@') {
@@ -953,6 +958,10 @@ static void __init parse_cmdline_early (
                                crashk_res.start = base;
                                crashk_res.end   = base + size - 1;
                        }
+#else
+                       printk("Ignoring crashkernel command line, "
+                              "parameter will be supplied by xen\n");
+#endif
                }
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1322,9 +1331,22 @@ void __init setup_bootmem_allocator(void
        }
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
        if (crashk_res.start != crashk_res.end)
                reserve_bootmem(crashk_res.start,
                        crashk_res.end - crashk_res.start + 1);
+#else
+       {
+               xen_kexec_reserve_t reservation;
+               BUG_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_reserve,
+                                       &reservation));
+               if (reservation.size) {
+                       crashk_res.start = reservation.start;
+                       crashk_res.end = reservation.start + 
+                               reservation.size - 1;
+               }
+       }
+#endif
 #endif
 
        if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1389,7 +1411,8 @@ legacy_init_iomem_resources(struct e820e
                        request_resource(res, data_resource);
 #endif
 #ifdef CONFIG_KEXEC
-                       request_resource(res, &crashk_res);
+                       if (crashk_res.start != crashk_res.end)
+                            request_resource(res, &crashk_res);
 #endif
                }
        }
--- /dev/null
+++ work/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h      2006-10-23 
11:36:17.000000000 +0900
@@ -0,0 +1,51 @@
+#ifndef _I386_KEXEC_XEN_H
+#define _I386_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_32.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+                                       struct cpu_user_regs *xen_regs)
+{
+       xen_regs->ebx    = linux_regs->ebx;
+       xen_regs->ecx    = linux_regs->ecx;
+       xen_regs->edx    = linux_regs->edx;
+       xen_regs->esi    = linux_regs->esi;
+       xen_regs->edi    = linux_regs->edi;
+       xen_regs->ebp    = linux_regs->ebp;
+       xen_regs->eax    = linux_regs->eax;
+       xen_regs->esp    = linux_regs->esp;
+       xen_regs->ss     = linux_regs->xss;
+       xen_regs->cs     = linux_regs->xcs;
+       xen_regs->ds     = linux_regs->xds;
+       xen_regs->es     = linux_regs->xes;
+       xen_regs->eflags = linux_regs->eflags;
+}
+
+/* Kexec needs to know about the actual physical addresss.
+ * But in xen, on some architectures, a physical address is a
+ * pseudo-physical addresss. */
+#ifdef CONFIG_XEN
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#else
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
+#endif /* _I386_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- 0001/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ work/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h 
2006-10-23 11:36:16.000000000 +0900
@@ -385,5 +385,13 @@ HYPERVISOR_xenoprof_op(
        return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec_op(
+       unsigned long op, void *args)
+{
+       return _hypercall2(int, kexec_op, op, args);
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
--- /dev/null
+++ 
work/patches/linux-2.6.16.29/git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch 
    2006-10-23 11:36:17.000000000 +0900
@@ -0,0 +1,401 @@
+From: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Date: Tue, 26 Sep 2006 08:52:38 +0000 (+0200)
+Subject: [PATCH] i386: Avoid overwriting the current pgd (V4, i386)
+X-Git-Url: 
http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=3566561bfadffcb5dbc85d576be80c0dbf2cccc9
+
+[PATCH] i386: Avoid overwriting the current pgd (V4, i386)
+
+kexec: Avoid overwriting the current pgd (V4, i386)
+
+This patch upgrades the i386-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables. These tables are used to provide
+an executable identity mapping without overwriting the current pgd.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+Signed-off-by: Andi Kleen <ak@xxxxxxx>
+---
+
+--- a/arch/i386/kernel/machine_kexec.c
++++ b/arch/i386/kernel/machine_kexec.c
+@@ -21,70 +21,13 @@
+ #include <asm/system.h>
+ 
+ #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+-
+-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L2_ATTR (_PAGE_PRESENT)
+-
+-#define LEVEL0_SIZE (1UL << 12UL)
+-
+-#ifndef CONFIG_X86_PAE
+-#define LEVEL1_SIZE (1UL << 22UL)
+-static u32 pgtable_level1[1024] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-      unsigned long level1_index, level2_index;
+-      u32 *pgtable_level2;
+-
+-      /* Find the current page table */
+-      pgtable_level2 = __va(read_cr3());
+-
+-      /* Find the indexes of the physical address to identity map */
+-      level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-      level2_index = address / LEVEL1_SIZE;
+-
+-      /* Identity map the page table entry */
+-      pgtable_level1[level1_index] = address | L0_ATTR;
+-      pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-
+-      /* Flush the tlb so the new mapping takes effect.
+-       * Global tlb entries are not flushed but that is not an issue.
+-       */
+-      load_cr3(pgtable_level2);
+-}
+-
+-#else
+-#define LEVEL1_SIZE (1UL << 21UL)
+-#define LEVEL2_SIZE (1UL << 30UL)
+-static u64 pgtable_level1[512] PAGE_ALIGNED;
+-static u64 pgtable_level2[512] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-      unsigned long level1_index, level2_index, level3_index;
+-      u64 *pgtable_level3;
+-
+-      /* Find the current page table */
+-      pgtable_level3 = __va(read_cr3());
+-
+-      /* Find the indexes of the physical address to identity map */
+-      level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-      level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+-      level3_index = address / LEVEL2_SIZE;
+-
+-      /* Identity map the page table entry */
+-      pgtable_level1[level1_index] = address | L0_ATTR;
+-      pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-      set_64bit(&pgtable_level3[level3_index],
+-                                             __pa(pgtable_level2) | L2_ATTR);
+-
+-      /* Flush the tlb so the new mapping takes effect.
+-       * Global tlb entries are not flushed but that is not an issue.
+-       */
+-      load_cr3(pgtable_level3);
+-}
++static u32 kexec_pgd[1024] PAGE_ALIGNED;
++#ifdef CONFIG_X86_PAE
++static u32 kexec_pmd0[1024] PAGE_ALIGNED;
++static u32 kexec_pmd1[1024] PAGE_ALIGNED;
+ #endif
++static u32 kexec_pte0[1024] PAGE_ALIGNED;
++static u32 kexec_pte1[1024] PAGE_ALIGNED;
+ 
+ static void set_idt(void *newidt, __u16 limit)
+ {
+@@ -128,16 +71,6 @@ static void load_segments(void)
+ #undef __STR
+ }
+ 
+-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+-                                      unsigned long indirection_page,
+-                                      unsigned long reboot_code_buffer,
+-                                      unsigned long start_address,
+-                                      unsigned int has_pae) ATTRIB_NORET;
+-
+-extern const unsigned char relocate_new_kernel[];
+-extern void relocate_new_kernel_end(void);
+-extern const unsigned int relocate_new_kernel_size;
+-
+ /*
+  * A architecture hook called to validate the
+  * proposed image and prepare the control pages
+@@ -170,25 +103,29 @@ void machine_kexec_cleanup(struct kimage
+  */
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+-      unsigned long page_list;
+-      unsigned long reboot_code_buffer;
+-
+-      relocate_new_kernel_t rnk;
++      unsigned long page_list[PAGES_NR];
++      void *control_page;
+ 
+       /* Interrupts aren't acceptable while we reboot */
+       local_irq_disable();
+ 
+-      /* Compute some offsets */
+-      reboot_code_buffer = page_to_pfn(image->control_code_page)
+-                                                              << PAGE_SHIFT;
+-      page_list = image->head;
+-
+-      /* Set up an identity mapping for the reboot_code_buffer */
+-      identity_map_page(reboot_code_buffer);
+-
+-      /* copy it out */
+-      memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+-                                              relocate_new_kernel_size);
++      control_page = page_address(image->control_code_page);
++      memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++      page_list[PA_CONTROL_PAGE] = __pa(control_page);
++      page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
++      page_list[PA_PGD] = __pa(kexec_pgd);
++      page_list[VA_PGD] = (unsigned long)kexec_pgd;
++#ifdef CONFIG_X86_PAE
++      page_list[PA_PMD_0] = __pa(kexec_pmd0);
++      page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
++      page_list[PA_PMD_1] = __pa(kexec_pmd1);
++      page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
++#endif
++      page_list[PA_PTE_0] = __pa(kexec_pte0);
++      page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
++      page_list[PA_PTE_1] = __pa(kexec_pte1);
++      page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ 
+       /* The segment registers are funny things, they have both a
+        * visible and an invisible part.  Whenever the visible part is
+@@ -207,8 +144,8 @@ NORET_TYPE void machine_kexec(struct kim
+       set_idt(phys_to_virt(0),0);
+ 
+       /* now call it */
+-      rnk = (relocate_new_kernel_t) reboot_code_buffer;
+-      (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
++      relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
++                      image->start, cpu_has_pae);
+ }
+ 
+ /* crashkernel=size@addr specifies the location to reserve for
+--- a/arch/i386/kernel/relocate_kernel.S
++++ b/arch/i386/kernel/relocate_kernel.S
+@@ -7,16 +7,138 @@
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
++#include <asm/kexec.h>
++
++/*
++ * Must be relocatable PIC code callable as a C function
++ */
++
++#define PTR(x) (x << 2)
++#define PAGE_ALIGNED (1 << PAGE_SHIFT)
++#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
++#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
++
++      .text
++      .align PAGE_ALIGNED
++      .globl relocate_kernel
++relocate_kernel:
++      movl    8(%esp), %ebp /* list of pages */
++
++#ifdef CONFIG_X86_PAE
++      /* map the control page at its virtual address */
++
++      movl    PTR(VA_PGD)(%ebp), %edi
++      movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0xc0000000, %eax
++      shrl    $27, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PMD_0)(%ebp), %edx
++      orl     $PAE_PGD_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PMD_0)(%ebp), %edi
++      movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x3fe00000, %eax
++      shrl    $18, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PTE_0)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PTE_0)(%ebp), %edi
++      movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x001ff000, %eax
++      shrl    $9, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      /* identity map the control page at its physical address */
++
++      movl    PTR(VA_PGD)(%ebp), %edi
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0xc0000000, %eax
++      shrl    $27, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PMD_1)(%ebp), %edx
++      orl     $PAE_PGD_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PMD_1)(%ebp), %edi
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x3fe00000, %eax
++      shrl    $18, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PTE_1)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PTE_1)(%ebp), %edi
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x001ff000, %eax
++      shrl    $9, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++#else
++      /* map the control page at its virtual address */
++
++      movl    PTR(VA_PGD)(%ebp), %edi
++      movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0xffc00000, %eax
++      shrl    $20, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PTE_0)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PTE_0)(%ebp), %edi
++      movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x003ff000, %eax
++      shrl    $10, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      /* identity map the control page at its physical address */
++
++      movl    PTR(VA_PGD)(%ebp), %edi
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0xffc00000, %eax
++      shrl    $20, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_PTE_1)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++
++      movl    PTR(VA_PTE_1)(%ebp), %edi
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
++      andl    $0x003ff000, %eax
++      shrl    $10, %eax
++      addl    %edi, %eax
++
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
++      orl     $PAGE_ATTR, %edx
++      movl    %edx, (%eax)
++#endif
+ 
+-      /*
+-       * Must be relocatable PIC code callable as a C function, that once
+-       * it starts can not use the previous processes stack.
+-       */
+-      .globl relocate_new_kernel
+ relocate_new_kernel:
+       /* read the arguments and say goodbye to the stack */
+       movl  4(%esp), %ebx /* page_list */
+-      movl  8(%esp), %ebp /* reboot_code_buffer */
++      movl  8(%esp), %ebp /* list of pages */
+       movl  12(%esp), %edx /* start address */
+       movl  16(%esp), %ecx /* cpu_has_pae */
+ 
+@@ -24,11 +146,26 @@ relocate_new_kernel:
+       pushl $0
+       popfl
+ 
+-      /* set a new stack at the bottom of our page... */
+-      lea   4096(%ebp), %esp
++      /* get physical address of control page now */
++      /* this is impossible after page table switch */
++      movl    PTR(PA_CONTROL_PAGE)(%ebp), %edi
++
++      /* switch to new set of page tables */
++      movl    PTR(PA_PGD)(%ebp), %eax
++      movl    %eax, %cr3
++
++      /* setup a new stack at the end of the physical control page */
++      lea     4096(%edi), %esp
+ 
+-      /* store the parameters back on the stack */
+-      pushl   %edx /* store the start address */
++      /* jump to identity mapped page */
++      movl    %edi, %eax
++      addl    $(identity_mapped - relocate_kernel), %eax
++      pushl   %eax
++      ret
++
++identity_mapped:
++      /* store the start address on the stack */
++      pushl   %edx
+ 
+       /* Set cr0 to a known state:
+        * 31 0 == Paging disabled
+@@ -113,8 +250,3 @@ relocate_new_kernel:
+       xorl    %edi, %edi
+       xorl    %ebp, %ebp
+       ret
+-relocate_new_kernel_end:
+-
+-      .globl relocate_new_kernel_size
+-relocate_new_kernel_size:
+-      .long relocate_new_kernel_end - relocate_new_kernel
+--- a/include/asm-i386/kexec.h
++++ b/include/asm-i386/kexec.h
+@@ -1,6 +1,26 @@
+ #ifndef _I386_KEXEC_H
+ #define _I386_KEXEC_H
+ 
++#define PA_CONTROL_PAGE  0
++#define VA_CONTROL_PAGE  1
++#define PA_PGD           2
++#define VA_PGD           3
++#define PA_PTE_0         4
++#define VA_PTE_0         5
++#define PA_PTE_1         6
++#define VA_PTE_1         7
++#ifdef CONFIG_X86_PAE
++#define PA_PMD_0         8
++#define VA_PMD_0         9
++#define PA_PMD_1         10
++#define VA_PMD_1         11
++#define PAGES_NR         12
++#else
++#define PAGES_NR         8
++#endif
++
++#ifndef __ASSEMBLY__
++
+ #include <asm/fixmap.h>
+ #include <asm/ptrace.h>
+ #include <asm/string.h>
+@@ -72,5 +92,12 @@ static inline void crash_setup_regs(stru
+                newregs->eip = (unsigned long)current_text_addr();
+        }
+ }
++asmlinkage NORET_TYPE void
++relocate_kernel(unsigned long indirection_page,
++              unsigned long control_page,
++              unsigned long start_address,
++              unsigned int has_pae) ATTRIB_NORET;
++
++#endif /* __ASSEMBLY__ */
+ 
+ #endif /* _I386_KEXEC_H */
--- /dev/null
+++ 
work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-move_segment_code-i386.patch
    2006-10-23 11:36:17.000000000 +0900
@@ -0,0 +1,169 @@
+kexec: Move asm segment handling code to the assembly file (i386)
+
+This patch moves the idt, gdt, and segment handling code from machine_kexec.c
+to relocate_kernel.S. The main reason behind this move is to avoid code 
+duplication in the Xen hypervisor. With this patch all code required to kexec
+is put on the control page.
+
+On top of that this patch also counts as a cleanup - I think it is much
+nicer to write assembly directly in assembly files than wrap inline assembly
+in C functions for no apparent reason.
+
+Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
+---
+
+ Applies to 2.6.19-rc1.
+
+ machine_kexec.c   |   59 -----------------------------------------------------
+ relocate_kernel.S |   58 +++++++++++++++++++++++++++++++++++++++++++++++-----
+ 2 files changed, 53 insertions(+), 64 deletions(-)
+
+--- 0002/arch/i386/kernel/machine_kexec.c
++++ work/arch/i386/kernel/machine_kexec.c      2006-10-05 15:49:08.000000000 
+0900
+@@ -29,48 +29,6 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
+ static u32 kexec_pte0[1024] PAGE_ALIGNED;
+ static u32 kexec_pte1[1024] PAGE_ALIGNED;
+ 
+-static void set_idt(void *newidt, __u16 limit)
+-{
+-      struct Xgt_desc_struct curidt;
+-
+-      /* ia32 supports unaliged loads & stores */
+-      curidt.size    = limit;
+-      curidt.address = (unsigned long)newidt;
+-
+-      load_idt(&curidt);
+-};
+-
+-
+-static void set_gdt(void *newgdt, __u16 limit)
+-{
+-      struct Xgt_desc_struct curgdt;
+-
+-      /* ia32 supports unaligned loads & stores */
+-      curgdt.size    = limit;
+-      curgdt.address = (unsigned long)newgdt;
+-
+-      load_gdt(&curgdt);
+-};
+-
+-static void load_segments(void)
+-{
+-#define __STR(X) #X
+-#define STR(X) __STR(X)
+-
+-      __asm__ __volatile__ (
+-              "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+-              "\t1:\n"
+-              "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+-              "\tmovl %%eax,%%ds\n"
+-              "\tmovl %%eax,%%es\n"
+-              "\tmovl %%eax,%%fs\n"
+-              "\tmovl %%eax,%%gs\n"
+-              "\tmovl %%eax,%%ss\n"
+-              ::: "eax", "memory");
+-#undef STR
+-#undef __STR
+-}
+-
+ /*
+  * A architecture hook called to validate the
+  * proposed image and prepare the control pages
+@@ -127,23 +85,6 @@ NORET_TYPE void machine_kexec(struct kim
+       page_list[PA_PTE_1] = __pa(kexec_pte1);
+       page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ 
+-      /* The segment registers are funny things, they have both a
+-       * visible and an invisible part.  Whenever the visible part is
+-       * set to a specific selector, the invisible part is loaded
+-       * with from a table in memory.  At no other time is the
+-       * descriptor table in memory accessed.
+-       *
+-       * I take advantage of this here by force loading the
+-       * segments, before I zap the gdt with an invalid value.
+-       */
+-      load_segments();
+-      /* The gdt & idt are now invalid.
+-       * If you want to load them you must set up your own idt & gdt.
+-       */
+-      set_gdt(phys_to_virt(0),0);
+-      set_idt(phys_to_virt(0),0);
+-
+-      /* now call it */
+       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+                       image->start, cpu_has_pae);
+ }
+--- 0002/arch/i386/kernel/relocate_kernel.S
++++ work/arch/i386/kernel/relocate_kernel.S    2006-10-05 16:03:21.000000000 
+0900
+@@ -154,14 +154,45 @@ relocate_new_kernel:
+       movl    PTR(PA_PGD)(%ebp), %eax
+       movl    %eax, %cr3
+ 
++      /* setup idt */
++      movl    %edi, %eax
++      addl    $(idt_48 - relocate_kernel), %eax
++      lidtl   (%eax)
++
++      /* setup gdt */
++      movl    %edi, %eax
++      addl    $(gdt - relocate_kernel), %eax
++      movl    %edi, %esi
++      addl    $((gdt_48 - relocate_kernel) + 2), %esi
++      movl    %eax, (%esi)
++      
++      movl    %edi, %eax
++      addl    $(gdt_48 - relocate_kernel), %eax
++      lgdtl   (%eax)
++
++      /* setup data segment registers */
++      mov     $(gdt_ds - gdt), %eax
++      mov     %eax, %ds
++      mov     %eax, %es
++      mov     %eax, %fs
++      mov     %eax, %gs
++      mov     %eax, %ss
++      
+       /* setup a new stack at the end of the physical control page */
+       lea     4096(%edi), %esp
+ 
+-      /* jump to identity mapped page */
+-      movl    %edi, %eax
+-      addl    $(identity_mapped - relocate_kernel), %eax
+-      pushl   %eax
+-      ret
++      /* load new code segment and jump to identity mapped page */
++      movl    %edi, %esi
++      xorl    %eax, %eax
++      pushl   %eax
++      pushl   %esi
++      pushl   %eax
++      movl    $(gdt_cs - gdt), %eax
++      pushl   %eax    
++      movl    %edi, %eax
++      addl    $(identity_mapped - relocate_kernel),%eax
++      pushl   %eax
++      iretl
+ 
+ identity_mapped:
+       /* store the start address on the stack */
+@@ -250,3 +281,20 @@ identity_mapped:
+       xorl    %edi, %edi
+       xorl    %ebp, %ebp
+       ret
++
++      .align  16
++gdt:
++      .quad   0x0000000000000000      /* NULL descriptor */
++gdt_cs:       
++      .quad   0x00cf9a000000ffff      /* kernel 4GB code at 0x00000000 */
++gdt_ds:
++      .quad   0x00cf92000000ffff      /* kernel 4GB data at 0x00000000 */
++gdt_end:
++      
++gdt_48:
++      .word   gdt_end - gdt - 1       /* limit */
++      .long   0                       /* base - filled in by code above */
++
++idt_48:
++      .word   0                       /* limit */
++      .long   0                       /* base */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-i386.patch  
2006-10-23 11:36:17.000000000 +0900
@@ -0,0 +1,54 @@
+--- 0004/arch/i386/kernel/machine_kexec.c
++++ work/arch/i386/kernel/machine_kexec.c      2006-10-11 18:34:06.000000000 
+0900
+@@ -20,6 +20,10 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
+ #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+ static u32 kexec_pgd[1024] PAGE_ALIGNED;
+ #ifdef CONFIG_X86_PAE
+@@ -29,6 +33,40 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
+ static u32 kexec_pte0[1024] PAGE_ALIGNED;
+ static u32 kexec_pte1[1024] PAGE_ALIGNED;
+ 
++#ifdef CONFIG_XEN
++
++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
++
++#if PAGES_NR > KEXEC_XEN_NO_PAGES
++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
++#endif
++
++#if PA_CONTROL_PAGE != 0
++#error PA_CONTROL_PAGE is non zero - Xen support will break
++#endif
++
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage 
*image)
++{
++      void *control_page;
++
++      memset(xki->page_list, 0, sizeof(xki->page_list));
++
++      control_page = page_address(image->control_code_page);
++      memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++      xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
++      xki->page_list[PA_PGD] = __ma(kexec_pgd);
++#ifdef CONFIG_X86_PAE
++      xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
++      xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
++#endif
++      xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
++      xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
++
++}
++
++#endif /* CONFIG_XEN */
++
+ /*
+  * A architecture hook called to validate the
+  * proposed image and prepare the control pages
--- 0004/patches/linux-2.6.16.29/series
+++ work/patches/linux-2.6.16.29/series 2006-10-23 11:36:16.000000000 +0900
@@ -1,6 +1,9 @@
 kexec-generic.patch
 git-2efe55a9cec8418f0e0cde3dc3787a42fddc4411.patch
 git-2a8a3d5b65e86ec1dfef7d268c64a909eab94af7.patch
+git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch
+linux-2.6.19-rc1-kexec-move_segment_code-i386.patch
+linux-2.6.19-rc1-kexec-xen-i386.patch
 blktap-aio-16_03_06.patch
 device_bind.patch
 fix-hz-suspend.patch
--- 0001/xen/arch/x86/x86_32/entry.S
+++ work/xen/arch/x86/x86_32/entry.S    2006-10-23 11:36:16.000000000 +0900
@@ -672,6 +672,7 @@ ENTRY(hypercall_table)
         .long do_hvm_op
         .long do_sysctl             /* 35 */
         .long do_domctl
+        .long do_kexec_op
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -714,6 +715,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_hvm_op            */
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
+        .byte 2 /* do_kexec_op          */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- 0004/xen/include/asm-x86/x86_32/elf.h
+++ work/xen/include/asm-x86/x86_32/elf.h       2006-10-23 11:36:17.000000000 
+0900
@@ -1,14 +1,39 @@
+/*
+ * Based heavily on include/asm-i386/elf.h and
+ * include/asm-i386/system.h from Linux 2.6.16
+ */
+
 #ifndef __X86_32_ELF_H__
 #define __X86_32_ELF_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#define ELF_NGREG 17
 
-#define ELF_NGREG 1       /* XXX: Define to be at least as large as
-                             however many register slots are needed when
-                             crash notes are written during crash dump */
+/* XXX: Xen doesn't have orig_eax.  For kdump, on a dom0 crash, the values
+ * for the crashing CPU could could be passed down from dom0, but is that
+ * neccessary?
+ * Also, I'm not sure why fs and gs are derived from the CPU
+ * rather than regs */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do {                           \
+    unsigned i;                                                         \
+    pr_reg[0] = regs->ebx;                                              \
+    pr_reg[1] = regs->ecx;                                              \
+    pr_reg[2] = regs->edx;                                              \
+    pr_reg[3] = regs->esi;                                              \
+    pr_reg[4] = regs->edi;                                              \
+    pr_reg[5] = regs->ebp;                                              \
+    pr_reg[6] = regs->eax;                                              \
+    pr_reg[7] = regs->ds;                                               \
+    pr_reg[8] = regs->es;                                               \
+    asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9]  = i;              \
+    asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i;              \
+    pr_reg[11] = 0; /* regs->orig_eax; */                               \
+    pr_reg[12] = regs->eip;                                             \
+    pr_reg[13] = regs->cs;                                              \
+    pr_reg[14] = regs->eflags;                                          \
+    pr_reg[15] = regs->esp;                                             \
+    pr_reg[16] = regs->ss;                                              \
+} while(0);
 
 #endif /* __X86_32_ELF_H__ */
 
--- 0004/xen/include/asm-x86/x86_32/kexec.h
+++ work/xen/include/asm-x86/x86_32/kexec.h     2006-10-23 11:36:17.000000000 
+0900
@@ -1,36 +1,92 @@
-#ifndef __X86_32_KEXEC_H__
-#define __X86_32_KEXEC_H__
+/******************************************************************************
+ * kexec.h
+ * 
+ * Based heavily on machine_kexec.c and kexec.h from Linux 2.6.19-rc1
+ *
+ */
+  
+#ifndef __X86_KEXEC_X86_32_H__
+#define __X86_KEXEC_X86_32_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
 #include <xen/types.h>
-#include <public/xen.h>
 #include <xen/kexec.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
 
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
 static inline void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
-                                      struct cpu_user_regs *oldregs)
+                    struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return;
+    memcpy(newregs, oldregs, sizeof(*newregs));
+    newregs->esp = (unsigned long)&(oldregs->esp);
+    __asm__ __volatile__(
+            "xorl %%eax, %%eax\n\t"
+            "movw %%ss, %%ax\n\t"
+            :"=a"(newregs->ss));
 }
-
+  
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and esp if coming via kernel
+ * mode exception.
+ */
 static inline void crash_setup_regs(struct cpu_user_regs *newregs,
-                                    struct cpu_user_regs *oldregs)
+                            struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    if (oldregs)
+        crash_fixup_ss_esp(newregs, oldregs);
+    else {
+        __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+        __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+        __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+        __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+        __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+        __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+        __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+        __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+        __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss));
+        __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs));
+        __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds));
+        __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es));
+        __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+
+        newregs->eip = (unsigned long)current_text_addr();
+    }
 }
 
+/*              
+ * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h
+ *
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value.  This tricky test checks that with
+ * one comparison.  Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
 static inline int user_mode(struct cpu_user_regs *regs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return -1;
+    return (regs->cs & 2) != 0;
 }
 
+typedef asmlinkage void (*relocate_new_kernel_t)(
+               unsigned long indirection_page,
+               unsigned long page_list,
+               unsigned long start_address,
+               unsigned int has_pae);
+
 static inline void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    relocate_new_kernel_t rnk;
+
+    rnk = (relocate_new_kernel_t) image->page_list[1];
+    (*rnk)(image->indirection_page, (unsigned long)image->page_list, 
+           image->start_address, (unsigned long)cpu_has_pae);
 }
 
-#endif /* __X86_32_KEXEC_H__ */
+#endif /* __X86_KEXEC_X86_32_H__ */
 
 /*
  * Local variables:

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel