WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [RFC][PATCH] xen: Kexec patch for pvops kernel

To: konrad.wilk@xxxxxxxxxx, ian.campbell@xxxxxxxxxx, vgoyal@xxxxxxxxxx, xen-devel@xxxxxxxxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx
Subject: [Xen-devel] [RFC][PATCH] xen: Kexec patch for pvops kernel
From: Daniel Kiper <dkiper@xxxxxxxxxxxx>
Date: Mon, 22 Aug 2011 18:23:16 +0200
Cc:
Delivery-date: Mon, 22 Aug 2011 09:24:24 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mutt/1.3.28i
Hi,

I am posting the first kexec patch for the pvops kernel. It applies to the
git://oss.oracle.com/git/kwilk/xen.git tree, stable/2.6.39.x branch.
It has been tested on x86_64 and compiles for x86_32. It should be used
with the latest kexec-tools development version, which can be found at
git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git.

TODO:
  - it should work both on bare metal and on the Xen hypervisor
    (currently this feature is broken; kexec/kdump works
    only on the Xen hypervisor),
  - move Xen code from generic and arch source files
    to Xen-specific files,
  - reuse available generic Linux kernel code
    as much as possible.

This is a work in progress (WIP) and I am looking for comments only.
It is not the final version.

Daniel

 arch/x86/include/asm/kexec.h         |   16 ++
 arch/x86/include/asm/xen/hypercall.h |    6 +
 arch/x86/kernel/machine_kexec_32.c   |  118 ++++++++--------
 arch/x86/kernel/machine_kexec_64.c   |  192 +++++++++++++++++---------
 arch/x86/kernel/relocate_kernel_32.S |   39 +++++-
 arch/x86/kernel/relocate_kernel_64.S |   36 +++++-
 arch/x86/kernel/setup.c              |    5 +-
 arch/x86/xen/enlighten.c             |   11 ++-
 drivers/base/cpu.c                   |    4 +-
 drivers/xen/Makefile                 |    1 +
 drivers/xen/machine_kexec.c          |  256 ++++++++++++++++++++++++++++++++++
 drivers/xen/sys-hypervisor.c         |   40 ++++++
 drivers/xen/xenbus/xenbus_probe.c    |   98 +++++++++++++
 include/linux/kexec.h                |   13 ++
 include/xen/interface/kexec.h        |  158 +++++++++++++++++++++
 include/xen/interface/xen.h          |    1 +
 kernel/kexec.c                       |   93 ++++++++++--
 17 files changed, 939 insertions(+), 148 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff17..578697e 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -5,14 +5,30 @@
 # define PA_CONTROL_PAGE       0
 # define VA_CONTROL_PAGE       1
 # define PA_PGD                        2
+# ifndef CONFIG_XEN
 # define PA_SWAP_PAGE          3
 # define PAGES_NR              4
+# else /* CONFIG_XEN */
+/*
+ * The hypervisor interface implicitly requires that all entries (except
+ * for possibly the final one) are arranged in matching PA_/VA_ pairs.
+#  define VA_PGD               3
+ */
+#  define PA_SWAP_PAGE         4
+#  define PAGES_NR             5
+# endif /* CONFIG_XEN */
 #else
 # define PA_CONTROL_PAGE       0
 # define VA_CONTROL_PAGE       1
 # define PA_TABLE_PAGE         2
+# ifndef CONFIG_XEN
 # define PA_SWAP_PAGE          3
 # define PAGES_NR              4
+# else /* CONFIG_XEN, see comment above
+#  define VA_TABLE_PAGE                3 */
+#  define PA_SWAP_PAGE         4
+#  define PAGES_NR             5
+# endif /* CONFIG_XEN */
 #endif
 
 # define KEXEC_CONTROL_CODE_MAX_SIZE   2048
diff --git a/arch/x86/include/asm/xen/hypercall.h 
b/arch/x86/include/asm/xen/hypercall.h
index 18882f7..2db0222 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -468,6 +468,12 @@ HYPERVISOR_xenoprof_op(unsigned int op, void *arg)
        return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int __must_check
+HYPERVISOR_kexec_op(unsigned long op, void *args)
+{
+       return _hypercall2(int, kexec_op, op, args);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
diff --git a/arch/x86/kernel/machine_kexec_32.c 
b/arch/x86/kernel/machine_kexec_32.c
index a3fa43b..14b7fa8 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -27,47 +27,13 @@
 #include <asm/cacheflush.h>
 #include <asm/debugreg.h>
 
-static void set_idt(void *newidt, __u16 limit)
-{
-       struct desc_ptr curidt;
-
-       /* ia32 supports unaliged loads & stores */
-       curidt.size    = limit;
-       curidt.address = (unsigned long)newidt;
-
-       load_idt(&curidt);
-}
-
+#ifdef CONFIG_XEN
+#include <xen/xen-ops.h>
 
-static void set_gdt(void *newgdt, __u16 limit)
-{
-       struct desc_ptr curgdt;
-
-       /* ia32 supports unaligned loads & stores */
-       curgdt.size    = limit;
-       curgdt.address = (unsigned long)newgdt;
+#include <xen/interface/kexec.h>
 
-       load_gdt(&curgdt);
-}
-
-static void load_segments(void)
-{
-#define __STR(X) #X
-#define STR(X) __STR(X)
-
-       __asm__ __volatile__ (
-               "\tljmp $"STR(__KERNEL_CS)",$1f\n"
-               "\t1:\n"
-               "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
-               "\tmovl %%eax,%%ds\n"
-               "\tmovl %%eax,%%es\n"
-               "\tmovl %%eax,%%fs\n"
-               "\tmovl %%eax,%%gs\n"
-               "\tmovl %%eax,%%ss\n"
-               : : : "eax", "memory");
-#undef STR
-#undef __STR
-}
+#include <asm/xen/page.h>
+#endif
 
 static void machine_kexec_free_page_tables(struct kimage *image)
 {
@@ -84,6 +50,15 @@ static int machine_kexec_alloc_page_tables(struct kimage 
*image)
 {
        image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
 #ifdef CONFIG_X86_PAE
+#ifdef CONFIG_XEN /* machine address must fit into xki->page_list[PA_PGD] */
+       if (image->arch.pgd) {
+               if 
(xen_create_contiguous_region(native_pgd_val(*image->arch.pgd), 0, 
BITS_PER_LONG) < 0) {
+                       __free_page(virt_to_page(image->arch.pgd));
+                       image->arch.pgd = NULL;
+                       return -ENOMEM;
+               }
+       }
+#endif
        image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
        image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
 #endif
@@ -139,6 +114,51 @@ static void machine_kexec_prepare_page_tables(struct 
kimage *image)
                __pa(control_page), __pa(control_page));
 }
 
+#ifdef CONFIG_XEN
+
+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
+
+#if PAGES_NR > KEXEC_XEN_NO_PAGES
+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
+#endif
+
+#if PA_CONTROL_PAGE != 0
+#error PA_CONTROL_PAGE is non zero - Xen support will break
+#endif
+
+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+       void *control_page;
+
+       memset(xki->page_list, 0, sizeof(xki->page_list));
+
+       control_page = page_address(image->control_code_page);
+       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+       xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
+       xki->page_list[PA_PGD] = __ma(image->arch.pgd);
+
+       if (image->type == KEXEC_TYPE_DEFAULT)
+               xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
+}
+
+int __init machine_kexec_setup_resources(struct resource *hypervisor,
+                                        struct resource *phys_cpus,
+                                        int nr_phys_cpus)
+{
+       int k;
+
+       /* The per-cpu crash note resources belong to the hypervisor resource */
+       for (k = 0; k < nr_phys_cpus; k++)
+               request_resource(hypervisor, phys_cpus + k);
+
+       return 0;
+}
+
+void machine_kexec_register_resources(struct resource *res) { ; }
+
+#endif /* CONFIG_XEN */
+
 /*
  * A architecture hook called to validate the
  * proposed image and prepare the control pages
@@ -176,6 +196,7 @@ void machine_kexec_cleanup(struct kimage *image)
        machine_kexec_free_page_tables(image);
 }
 
+#ifndef CONFIG_XEN
 /*
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
@@ -228,24 +249,6 @@ void machine_kexec(struct kimage *image)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);
 
-       /*
-        * The segment registers are funny things, they have both a
-        * visible and an invisible part.  Whenever the visible part is
-        * set to a specific selector, the invisible part is loaded
-        * with from a table in memory.  At no other time is the
-        * descriptor table in memory accessed.
-        *
-        * I take advantage of this here by force loading the
-        * segments, before I zap the gdt with an invalid value.
-        */
-       load_segments();
-       /*
-        * The gdt & idt are now invalid.
-        * If you want to load them you must set up your own idt & gdt.
-        */
-       set_gdt(phys_to_virt(0), 0);
-       set_idt(phys_to_virt(0), 0);
-
        /* now call it */
        image->start = relocate_kernel_ptr((unsigned long)image->head,
                                           (unsigned long)page_list,
@@ -259,6 +262,7 @@ void machine_kexec(struct kimage *image)
 
        __ftrace_enabled_restore(save_ftrace_enabled);
 }
+#endif
 
 void arch_crash_save_vmcoreinfo(void)
 {
diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db..c7623a4 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -21,6 +21,115 @@
 #include <asm/mmu_context.h>
 #include <asm/debugreg.h>
 
+#ifdef CONFIG_XEN
+
+/* In the case of Xen, override hypervisor functions to be able to create
+ * a regular identity mapping page table...
+ */
+
+#include <xen/interface/kexec.h>
+#include <xen/interface/memory.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+
+#define x__pmd(x) ((pmd_t) { (x) } )
+#define x__pud(x) ((pud_t) { (x) } )
+#define x__pgd(x) ((pgd_t) { (x) } )
+
+#define x_pmd_val(x)   ((x).pmd)
+#define x_pud_val(x)   ((x).pud)
+#define x_pgd_val(x)   ((x).pgd)
+
+static inline void x_set_pmd(pmd_t *dst, pmd_t val)
+{
+       x_pmd_val(*dst) = x_pmd_val(val);
+}
+
+static inline void x_set_pud(pud_t *dst, pud_t val)
+{
+       x_pud_val(*dst) = phys_to_machine(XPADDR(x_pud_val(val))).maddr;
+}
+
+static inline void x_pud_clear (pud_t *pud)
+{
+       x_pud_val(*pud) = 0;
+}
+
+static inline void x_set_pgd(pgd_t *dst, pgd_t val)
+{
+       x_pgd_val(*dst) = phys_to_machine(XPADDR(x_pgd_val(val))).maddr;
+}
+
+static inline void x_pgd_clear (pgd_t * pgd)
+{
+       x_pgd_val(*pgd) = 0;
+}
+
+#define X__PAGE_KERNEL_LARGE_EXEC \
+         _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE
+#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY
+
+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
+
+#if PAGES_NR > KEXEC_XEN_NO_PAGES
+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
+#endif
+
+#if PA_CONTROL_PAGE != 0
+#error PA_CONTROL_PAGE is non zero - Xen support will break
+#endif
+
+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+       void *control_page;
+       void *table_page;
+
+       memset(xki->page_list, 0, sizeof(xki->page_list));
+
+       control_page = page_address(image->control_code_page) + PAGE_SIZE;
+       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+       table_page = page_address(image->control_code_page);
+
+       xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
+       xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
+
+       if (image->type == KEXEC_TYPE_DEFAULT)
+               xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
+}
+
+int __init machine_kexec_setup_resources(struct resource *hypervisor,
+                                        struct resource *phys_cpus,
+                                        int nr_phys_cpus)
+{
+       int k;
+
+       /* The per-cpu crash note resources belong to the hypervisor resource */
+       for (k = 0; k < nr_phys_cpus; k++)
+               request_resource(hypervisor, phys_cpus + k);
+
+       return 0;
+}
+
+#else /* CONFIG_XEN */
+
+#define x__pmd(x) __pmd(x)
+#define x__pud(x) __pud(x)
+#define x__pgd(x) __pgd(x)
+
+#define x_set_pmd(x, y) set_pmd(x, y)
+#define x_set_pud(x, y) set_pud(x, y)
+#define x_set_pgd(x, y) set_pgd(x, y)
+
+#define x_pud_clear(x) pud_clear(x)
+#define x_pgd_clear(x) pgd_clear(x)
+
+#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#define X_KERNPG_TABLE _KERNPG_TABLE
+
+#endif /* CONFIG_XEN */
+
 static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
                                unsigned long addr)
 {
@@ -50,7 +159,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t 
*pgd,
        }
        pmd = pmd_offset(pud, addr);
        if (!pmd_present(*pmd))
-               set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+               x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
        result = 0;
 out:
        return result;
@@ -63,7 +172,7 @@ static void init_level2_page(pmd_t *level2p, unsigned long 
addr)
        addr &= PAGE_MASK;
        end_addr = addr + PUD_SIZE;
        while (addr < end_addr) {
-               set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+               x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
                addr += PMD_SIZE;
        }
 }
@@ -88,12 +197,12 @@ static int init_level3_page(struct kimage *image, pud_t 
*level3p,
                }
                level2p = (pmd_t *)page_address(page);
                init_level2_page(level2p, addr);
-               set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
+               x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
                addr += PUD_SIZE;
        }
        /* clear the unused entries */
        while (addr < end_addr) {
-               pud_clear(level3p++);
+               x_pud_clear(level3p++);
                addr += PUD_SIZE;
        }
 out:
@@ -123,12 +232,12 @@ static int init_level4_page(struct kimage *image, pgd_t 
*level4p,
                result = init_level3_page(image, level3p, addr, last_addr);
                if (result)
                        goto out;
-               set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
+               x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
                addr += PGDIR_SIZE;
        }
        /* clear the unused entries */
        while (addr < end_addr) {
-               pgd_clear(level4p++);
+               x_pgd_clear(level4p++);
                addr += PGDIR_SIZE;
        }
 out:
@@ -189,8 +298,14 @@ static int init_pgtable(struct kimage *image, unsigned 
long start_pgtable)
 {
        pgd_t *level4p;
        int result;
+       unsigned long x_max_pfn = max_pfn;
+
+#ifdef CONFIG_XEN
+       x_max_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
+#endif
+
        level4p = (pgd_t *)__va(start_pgtable);
-       result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
+       result = init_level4_page(image, level4p, 0, x_max_pfn << PAGE_SHIFT);
        if (result)
                return result;
        /*
@@ -203,47 +318,6 @@ static int init_pgtable(struct kimage *image, unsigned 
long start_pgtable)
        return init_transition_pgtable(image, level4p);
 }
 
-static void set_idt(void *newidt, u16 limit)
-{
-       struct desc_ptr curidt;
-
-       /* x86-64 supports unaliged loads & stores */
-       curidt.size    = limit;
-       curidt.address = (unsigned long)newidt;
-
-       __asm__ __volatile__ (
-               "lidtq %0\n"
-               : : "m" (curidt)
-               );
-};
-
-
-static void set_gdt(void *newgdt, u16 limit)
-{
-       struct desc_ptr curgdt;
-
-       /* x86-64 supports unaligned loads & stores */
-       curgdt.size    = limit;
-       curgdt.address = (unsigned long)newgdt;
-
-       __asm__ __volatile__ (
-               "lgdtq %0\n"
-               : : "m" (curgdt)
-               );
-};
-
-static void load_segments(void)
-{
-       __asm__ __volatile__ (
-               "\tmovl %0,%%ds\n"
-               "\tmovl %0,%%es\n"
-               "\tmovl %0,%%ss\n"
-               "\tmovl %0,%%fs\n"
-               "\tmovl %0,%%gs\n"
-               : : "a" (__KERNEL_DS) : "memory"
-               );
-}
-
 int machine_kexec_prepare(struct kimage *image)
 {
        unsigned long start_pgtable;
@@ -265,6 +339,7 @@ void machine_kexec_cleanup(struct kimage *image)
        free_transition_pgtable(image);
 }
 
+#ifndef CONFIG_XEN
 /*
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
@@ -311,24 +386,6 @@ void machine_kexec(struct kimage *image)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);
 
-       /*
-        * The segment registers are funny things, they have both a
-        * visible and an invisible part.  Whenever the visible part is
-        * set to a specific selector, the invisible part is loaded
-        * with from a table in memory.  At no other time is the
-        * descriptor table in memory accessed.
-        *
-        * I take advantage of this here by force loading the
-        * segments, before I zap the gdt with an invalid value.
-        */
-       load_segments();
-       /*
-        * The gdt & idt are now invalid.
-        * If you want to load them you must set up your own idt & gdt.
-        */
-       set_gdt(phys_to_virt(0), 0);
-       set_idt(phys_to_virt(0), 0);
-
        /* now call it */
        image->start = relocate_kernel((unsigned long)image->head,
                                       (unsigned long)page_list,
@@ -342,10 +399,13 @@ void machine_kexec(struct kimage *image)
 
        __ftrace_enabled_restore(save_ftrace_enabled);
 }
+#endif
 
 void arch_crash_save_vmcoreinfo(void)
 {
+#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
        VMCOREINFO_SYMBOL(phys_base);
+#endif
        VMCOREINFO_SYMBOL(init_level4_pgt);
 
 #ifdef CONFIG_NUMA
diff --git a/arch/x86/kernel/relocate_kernel_32.S 
b/arch/x86/kernel/relocate_kernel_32.S
index 4123553..fe0fbfb 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -87,14 +87,32 @@ relocate_kernel:
        movl    PTR(PA_PGD)(%ebp), %eax
        movl    %eax, %cr3
 
+       /* setup idt */
+       lidtl   idt_48 - relocate_kernel(%edi)
+
+       /* setup gdt */
+       leal    gdt - relocate_kernel(%edi), %eax
+       movl    %eax, (gdt_48 - relocate_kernel) + 2(%edi)
+       lgdtl   gdt_48 - relocate_kernel(%edi)
+
+       /* setup data segment registers */
+       mov     $(gdt_ds - gdt), %eax
+       mov     %eax, %ds
+       mov     %eax, %es
+       mov     %eax, %fs
+       mov     %eax, %gs
+       mov     %eax, %ss
+
        /* setup a new stack at the end of the physical control page */
        lea     PAGE_SIZE(%edi), %esp
 
-       /* jump to identity mapped page */
+       /* load new code segment and jump to identity mapped page */
+       pushl   $0
+       pushl   $(gdt_cs - gdt)
        movl    %edi, %eax
        addl    $(identity_mapped - relocate_kernel), %eax
        pushl   %eax
-       ret
+       iretl
 
 identity_mapped:
        /* store the start address on the stack */
@@ -271,5 +289,22 @@ swap_pages:
        popl    %ebp
        ret
 
+       .align  16
+gdt:
+       .quad   0x0000000000000000      /* NULL descriptor */
+gdt_cs:
+       .quad   0x00cf9a000000ffff      /* kernel 4GB code at 0x00000000 */
+gdt_ds:
+       .quad   0x00cf92000000ffff      /* kernel 4GB data at 0x00000000 */
+gdt_end:
+
+gdt_48:
+       .word   gdt_end - gdt - 1       /* limit */
+       .long   0                       /* base - filled in by code above */
+
+idt_48:
+       .word   0                       /* limit */
+       .long   0                       /* base */
+
        .globl kexec_control_code_size
 .set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S 
b/arch/x86/kernel/relocate_kernel_64.S
index 4de8f5b..bb0455d 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -91,13 +91,30 @@ relocate_kernel:
        /* Switch to the identity mapped page tables */
        movq    %r9, %cr3
 
+       /* setup idt */
+       lidtq   idt_80 - relocate_kernel(%r8)
+
+       /* setup gdt */
+       leaq    gdt - relocate_kernel(%r8), %rax
+       movq    %rax, (gdt_80 - relocate_kernel) + 2(%r8)
+       lgdtq   gdt_80 - relocate_kernel(%r8)
+
+       /* setup data segment registers */
+       xorl    %eax, %eax
+       movl    %eax, %ds
+       movl    %eax, %es
+       movl    %eax, %fs
+       movl    %eax, %gs
+       movl    %eax, %ss
+
        /* setup a new stack at the end of the physical control page */
        lea     PAGE_SIZE(%r8), %rsp
 
-       /* jump to identity mapped page */
+       /* load new code segment and jump to identity mapped page */
        addq    $(identity_mapped - relocate_kernel), %r8
+       pushq   $(gdt_cs - gdt)
        pushq   %r8
-       ret
+       lretq
 
 identity_mapped:
        /* store the start address on the stack */
@@ -262,5 +279,20 @@ swap_pages:
 3:
        ret
 
+       .align  16
+gdt:
+       .quad   0x0000000000000000      /* NULL descriptor */
+gdt_cs:
+       .quad   0x00af9a000000ffff
+gdt_end:
+
+gdt_80:
+       .word   gdt_end - gdt - 1       /* limit */
+       .quad   0                       /* base - filled in by code above */
+
+idt_80:
+       .word   0                       /* limit */
+       .quad   0                       /* base */
+
        .globl kexec_control_code_size
 .set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c6724e4..b978d7e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -509,7 +509,7 @@ static void __init 
memblock_x86_reserve_range_setup_data(void)
  */
 
 #ifdef CONFIG_KEXEC
-
+#ifndef CONFIG_XEN
 static inline unsigned long long get_total_mem(void)
 {
        unsigned long long total;
@@ -581,6 +581,9 @@ static void __init reserve_crashkernel(void)
        insert_resource(&iomem_resource, &crashk_res);
 }
 #else
+#define reserve_crashkernel xen_machine_kexec_setup_resources
+#endif
+#else
 static void __init reserve_crashkernel(void)
 {
 }
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 8a8a156..b504d0e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1030,7 +1030,9 @@ static void xen_emergency_restart(void)
 
 static void xen_machine_halt(void)
 {
+#ifndef CONFIG_KEXEC
        xen_reboot(SHUTDOWN_poweroff);
+#endif
 }
 
 static void xen_machine_power_off(void)
@@ -1040,10 +1042,13 @@ static void xen_machine_power_off(void)
        xen_reboot(SHUTDOWN_poweroff);
 }
 
+#ifdef CONFIG_KEXEC
 static void xen_crash_shutdown(struct pt_regs *regs)
 {
-       xen_reboot(SHUTDOWN_crash);
+       /* The kernel is broken so disable interrupts */
+       local_irq_disable();
 }
+#endif
 
 static int
 xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
@@ -1067,8 +1072,10 @@ static const struct machine_ops xen_machine_ops 
__initconst = {
        .halt = xen_machine_halt,
        .power_off = xen_machine_power_off,
        .shutdown = xen_machine_halt,
-       .crash_shutdown = xen_crash_shutdown,
        .emergency_restart = xen_emergency_restart,
+#ifdef CONFIG_KEXEC
+       .crash_shutdown = xen_crash_shutdown
+#endif
 };
 
 /*
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 251acea..24d71fd 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -106,7 +106,7 @@ static inline void register_cpu_control(struct cpu *cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-#ifdef CONFIG_KEXEC
+#if defined(CONFIG_KEXEC) && !defined(CONFIG_XEN)
 #include <linux/kexec.h>
 
 static ssize_t show_crash_notes(struct sys_device *dev, struct 
sysdev_attribute *attr,
@@ -231,7 +231,7 @@ int __cpuinit register_cpu(struct cpu *cpu, int num)
        if (!error)
                register_cpu_under_node(num, cpu_to_node(num));
 
-#ifdef CONFIG_KEXEC
+#if defined(CONFIG_KEXEC) && !defined(CONFIG_XEN)
        if (!error)
                error = sysdev_create_file(&cpu->sysdev, &attr_crash_notes);
 #endif
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index f1d5622..c0451cd 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_ACPI_PROCESSOR_XEN) += acpi_processor.o
 obj-$(CONFIG_SWIOTLB_XEN)              += swiotlb-xen.o
 obj-$(CONFIG_XEN_DOM0)                 += pci.o
 obj-$(CONFIG_XEN_TMEM)         += tmem.o
+obj-$(CONFIG_KEXEC)                    += machine_kexec.o
 
 xen-evtchn-y                           := evtchn.o
 xen-gntdev-y                           := gntdev.o
diff --git a/drivers/xen/machine_kexec.c b/drivers/xen/machine_kexec.c
new file mode 100644
index 0000000..8cd20e4
--- /dev/null
+++ b/drivers/xen/machine_kexec.c
@@ -0,0 +1,256 @@
+/*
+ * Handle transition of Linux booting another kernel.
+ */
+
+#include <linux/kexec.h>
+#include <linux/reboot.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+
+#include <xen/xen-ops.h>
+
+#include <xen/interface/kexec.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+
+extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, 
+                                        struct kimage *image);
+extern int machine_kexec_setup_resources(struct resource *hypervisor,
+                                        struct resource *phys_cpus,
+                                        int nr_phys_cpus);
+extern void machine_kexec_register_resources(struct resource *res);
+
+static int __initdata xen_max_nr_phys_cpus;
+static struct resource xen_hypervisor_res;
+#if 0
+static struct resource *xen_phys_cpus;
+#endif
+static struct resource xen_phys_cpus[16];
+
+size_t vmcoreinfo_size_xen;
+unsigned long paddr_vmcoreinfo_xen;
+
+void __init xen_machine_kexec_setup_resources(void)
+{
+       xen_kexec_range_t range;
+       struct resource *res;
+       int k = 0;
+       int rc;
+
+       if (strstr(boot_command_line, "crashkernel="))
+               printk(KERN_WARNING "Ignoring crashkernel command line, "
+                      "parameter will be supplied by xen\n");
+
+       if (!xen_initial_domain())
+               return;
+
+       /* determine maximum number of physical cpus */
+
+       while (1) {
+               memset(&range, 0, sizeof(range));
+               range.range = KEXEC_RANGE_MA_CPU;
+               range.nr = k;
+
+               if(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+                       break;
+
+               k++;
+       }
+
+       if (k == 0)
+               return;
+
+       xen_max_nr_phys_cpus = k;
+
+#if 0
+       /* allocate xen_phys_cpus */
+
+       xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource));
+#endif
+
+       /* fill in xen_phys_cpus with per-cpu crash note information */
+
+       for (k = 0; k < xen_max_nr_phys_cpus; k++) {
+               memset(&range, 0, sizeof(range));
+               range.range = KEXEC_RANGE_MA_CPU;
+               range.nr = k;
+
+               if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+                       goto err;
+
+               res = xen_phys_cpus + k;
+
+               memset(res, 0, sizeof(*res));
+               res->name = "Crash note";
+               res->start = range.start;
+               res->end = range.start + range.size - 1;
+               res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       }
+
+       /* fill in xen_hypervisor_res with hypervisor machine address range */
+
+       memset(&range, 0, sizeof(range));
+       range.range = KEXEC_RANGE_MA_XEN;
+
+       if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+               goto err;
+
+       xen_hypervisor_res.name = "Hypervisor code and data";
+       xen_hypervisor_res.start = range.start;
+       xen_hypervisor_res.end = range.start + range.size - 1;
+       xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+#ifdef CONFIG_X86
+       insert_resource(&iomem_resource, &xen_hypervisor_res);
+#endif
+
+       /* fill in crashk_res if range is reserved by hypervisor */
+
+       memset(&range, 0, sizeof(range));
+       range.range = KEXEC_RANGE_MA_CRASH;
+
+       if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+               goto err;
+
+       if (range.size) {
+               crashk_res.start = range.start;
+               crashk_res.end = range.start + range.size - 1;
+#ifdef CONFIG_X86
+               insert_resource(&iomem_resource, &crashk_res);
+#endif
+       }
+
+       /* get physical address of vmcoreinfo */
+       memset(&range, 0, sizeof(range));
+       range.range = KEXEC_RANGE_MA_VMCOREINFO;
+
+       rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range);
+
+       if (rc == 0) {
+               /* Hypercall succeeded */
+               vmcoreinfo_size_xen = range.size;
+               paddr_vmcoreinfo_xen = range.start;
+
+       } else {
+               /* Hypercall failed.
+                * Indicate not to create sysfs file by resetting globals
+                */
+               vmcoreinfo_size_xen = 0;
+               paddr_vmcoreinfo_xen = 0;
+               
+               /* The KEXEC_CMD_kexec_get_range hypercall did not implement
+                * KEXEC_RANGE_MA_VMCOREINFO until Xen 3.3.
+                * Do not bail out if it fails for this reason.
+                */
+               if (rc != -EINVAL)
+                       return;
+       }
+
+       if (machine_kexec_setup_resources(&xen_hypervisor_res, xen_phys_cpus,
+                                         xen_max_nr_phys_cpus))
+               goto err;
+
+#ifdef CONFIG_X86
+       for (k = 0; k < xen_max_nr_phys_cpus; k++) {
+               res = xen_phys_cpus + k;
+               if (!res->parent) /* outside of xen_hypervisor_res range */
+                       insert_resource(&iomem_resource, res);
+       }
+
+       if (xen_create_contiguous_region((unsigned long)&vmcoreinfo_note,
+                                        get_order(sizeof(vmcoreinfo_note)),
+                                        BITS_PER_LONG))
+               goto err;
+#endif
+
+       return;
+
+ err:
+       /*
+        * It isn't possible to free xen_phys_cpus this early in the
+        * boot. Failure at this stage is unexpected and the amount of
+        * memory is small therefore we tolerate the potential leak.
+         */
+       xen_max_nr_phys_cpus = 0;
+       return;
+}
+
+#ifndef CONFIG_X86
+void __init xen_machine_kexec_register_resources(struct resource *res)
+{
+       int k;
+       struct resource *r;
+
+       request_resource(res, &xen_hypervisor_res);
+       for (k = 0; k < xen_max_nr_phys_cpus; k++) {
+               r = xen_phys_cpus + k;
+               if (r->parent == NULL) /* out of xen_hypervisor_res range */
+                       request_resource(res, r);
+       } 
+       machine_kexec_register_resources(res);
+}
+#endif
+
+static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+       machine_kexec_setup_load_arg(xki, image);
+
+       xki->indirection_page = image->head;
+       xki->start_address = image->start;
+}
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+       xen_kexec_load_t xkl;
+
+       memset(&xkl, 0, sizeof(xkl));
+       xkl.type = image->type;
+       setup_load_arg(&xkl.image, image);
+       return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
+}
+
+/*
+ * Unload the image that was stored by machine_kexec_load()
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and its possible xen could try and kdump
+ * using resources that have been freed.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+       xen_kexec_load_t xkl;
+
+       memset(&xkl, 0, sizeof(xkl));
+       xkl.type = image->type;
+       WARN_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl));
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the prefered reboot CPU, 
+ * stop all CPUs and kexec. That is it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+       xen_kexec_exec_t xke;
+
+       memset(&xke, 0, sizeof(xke));
+       xke.type = image->type;
+       (void)HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
+       panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+#ifdef CONFIG_X86
+unsigned long paddr_vmcoreinfo_note(void)
+{
+       return virt_to_machine(&vmcoreinfo_note).maddr;
+}
+#endif
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 1e0fe01..0dc4f51 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -355,6 +355,31 @@ static void xen_properties_destroy(void)
        sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
 }
 
+#ifdef CONFIG_KEXEC
+
+extern size_t vmcoreinfo_size_xen;
+extern unsigned long paddr_vmcoreinfo_xen;
+
+static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page)
+{
+       return sprintf(page, "%lx %zx\n",
+               paddr_vmcoreinfo_xen, vmcoreinfo_size_xen);
+}
+
+HYPERVISOR_ATTR_RO(vmcoreinfo);
+
+static int __init xen_sysfs_vmcoreinfo_init(void)
+{
+       return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+}
+
+static void xen_sysfs_vmcoreinfo_destroy(void)
+{
+       sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+}
+
+#endif
+
 static int __init hyper_sysfs_init(void)
 {
        int ret;
@@ -377,9 +402,20 @@ static int __init hyper_sysfs_init(void)
        ret = xen_properties_init();
        if (ret)
                goto prop_out;
+#ifdef CONFIG_KEXEC
+       if (vmcoreinfo_size_xen) {
+               ret = xen_sysfs_vmcoreinfo_init();
+               if (ret)
+                       goto vmcoreinfo_out;
+       }
+#endif
 
        goto out;
 
+#ifdef CONFIG_KEXEC
+vmcoreinfo_out:
+#endif
+       xen_properties_destroy();
 prop_out:
        xen_sysfs_uuid_destroy();
 uuid_out:
@@ -394,6 +430,10 @@ out:
 
 static void __exit hyper_sysfs_exit(void)
 {
+#ifdef CONFIG_KEXEC
+       if (vmcoreinfo_size_xen)
+               xen_sysfs_vmcoreinfo_destroy();
+#endif
        xen_properties_destroy();
        xen_compilation_destroy();
        xen_sysfs_uuid_destroy();
diff --git a/drivers/xen/xenbus/xenbus_probe.c 
b/drivers/xen/xenbus/xenbus_probe.c
index 7397695..4ffe83c 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -673,8 +673,106 @@ void unregister_xenstore_notifier(struct notifier_block 
*nb)
 }
 EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
 
+#ifdef CONFIG_CRASH_DUMP
+static DECLARE_WAIT_QUEUE_HEAD(be_state_wq);
+static int be_state;
+
+static void xenbus_reset_state_changed(struct xenbus_watch *w, const char **v, 
unsigned int l)
+{
+       xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &be_state);
+       printk(KERN_INFO "XENBUS: %s %s\n", v[XS_WATCH_PATH], 
xenbus_strstate(be_state));
+       wake_up(&be_state_wq);
+}
+
+static int xenbus_reset_check_final(int *st)
+{
+       return *st == XenbusStateInitialising || *st == XenbusStateInitWait;
+}
+
+static void xenbus_reset_frontend_state(char *backend, char *frontend)
+{
+       struct xenbus_watch watch;
+
+       memset(&watch, 0, sizeof(watch));
+       watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", backend);
+       if (!watch.node)
+               return;
+
+       watch.callback = xenbus_reset_state_changed;
+       be_state = XenbusStateUnknown;
+
+       printk(KERN_INFO "XENBUS: triggering reconnect on %s\n", backend);
+       register_xenbus_watch(&watch);
+
+       xenbus_printf(XBT_NIL, frontend, "state", "%d", XenbusStateClosing);
+       wait_event_interruptible(be_state_wq, be_state == XenbusStateClosing);
+
+       xenbus_printf(XBT_NIL, frontend, "state", "%d", XenbusStateClosed);
+       wait_event_interruptible(be_state_wq, be_state == XenbusStateClosed);
+
+       xenbus_printf(XBT_NIL, frontend, "state", "%d", 
XenbusStateInitialising);
+       wait_event_interruptible(be_state_wq, 
xenbus_reset_check_final(&be_state));
+
+       unregister_xenbus_watch(&watch);
+       printk(KERN_INFO "XENBUS: reconnect done on %s\n", backend);
+       kfree(watch.node);
+}
+
+static void xenbus_reset_check_state(char *class, char *dev)
+{
+       int state, err;
+       char *backend, *frontend;
+
+       frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev);
+       if (!frontend)
+               return;
+
+       err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &state);
+       /* frontend connected? */
+       if (err == 1 && state == XenbusStateConnected) {
+               backend = xenbus_read(XBT_NIL, frontend, "backend", NULL);
+               if (!backend || IS_ERR(backend))
+                       goto out;
+               err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &state);
+               /* backend connected? */
+               if (err == 1 && state == XenbusStateConnected)
+                       xenbus_reset_frontend_state(backend, frontend);
+               kfree(backend);
+       }
+out:
+       kfree(frontend);
+}
+
+static void xenbus_reset_state(void)
+{
+       char **devclass, **dev;
+       int devclass_n, dev_n;
+       int i, j;
+
+       devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n);
+       if (IS_ERR(devclass))
+               return;
+
+       for (i = 0; i < devclass_n; i++) {
+               dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n);
+               if (IS_ERR(dev))
+                       continue;
+               for (j = 0; j < dev_n; j++)
+                       xenbus_reset_check_state(devclass[i], dev[j]);
+               kfree(dev);
+       }
+       kfree(devclass);
+}
+#endif
+
 void xenbus_probe(struct work_struct *unused)
 {
+#ifdef CONFIG_CRASH_DUMP
+       /* reset devices in XenbusStateConnected state */
+       if (reset_devices)
+               xenbus_reset_state();
+#endif
+
        xenstored_ready = 1;
 
        /* Notify others that xenstore is up */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index c2478a3..15565c6 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -112,6 +112,12 @@ struct kimage {
 extern void machine_kexec(struct kimage *image);
 extern int machine_kexec_prepare(struct kimage *image);
 extern void machine_kexec_cleanup(struct kimage *image);
+#ifdef CONFIG_XEN
+extern int xen_machine_kexec_load(struct kimage *image);
+extern void xen_machine_kexec_unload(struct kimage *image);
+extern void xen_machine_kexec_setup_resources(void);
+extern void xen_machine_kexec_register_resources(struct resource *res);
+#endif
 extern asmlinkage long sys_kexec_load(unsigned long entry,
                                        unsigned long nr_segments,
                                        struct kexec_segment __user *segments,
@@ -192,8 +198,15 @@ extern struct kimage *kexec_crash_image;
 #define VMCOREINFO_BYTES           (4096)
 #define VMCOREINFO_NOTE_NAME       "VMCOREINFO"
 #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
+#if !defined(CONFIG_XEN) || !defined(CONFIG_X86)
 #define VMCOREINFO_NOTE_SIZE       (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES 
\
                                    + VMCOREINFO_NOTE_NAME_BYTES)
+#else
+#define VMCOREINFO_NOTE_SIZE       ALIGN(KEXEC_NOTE_HEAD_BYTES*2 \
+                                        + VMCOREINFO_BYTES \
+                                        + VMCOREINFO_NOTE_NAME_BYTES, \
+                                        PAGE_SIZE)
+#endif
 
 /* Location of a reserved region to hold the crash kernel.
  */
diff --git a/include/xen/interface/kexec.h b/include/xen/interface/kexec.h
new file mode 100644
index 0000000..5fd0495
--- /dev/null
+++ b/include/xen/interface/kexec.h
@@ -0,0 +1,158 @@
+/******************************************************************************
+ * kexec.h - Public portion
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ * 
+ * Xen port written by:
+ * - Simon 'Horms' Horman <horms@xxxxxxxxxxxx>
+ * - Magnus Damm <magnus@xxxxxxxxxxxxx>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+
+/* This file describes the Kexec / Kdump hypercall interface for Xen.
+ *
+ * Kexec under vanilla Linux allows a user to reboot the physical machine 
+ * into a new user-specified kernel. The Xen port extends this idea
+ * to allow rebooting of the machine from dom0. When kexec for dom0
+ * is used to reboot,  both the hypervisor and the domains get replaced
+ * with some other kernel. It is possible to kexec between vanilla
+ * Linux and Xen and back again. Xen to Xen works well too.
+ *
+ * The hypercall interface for kexec can be divided into three main
+ * types of hypercall operations:
+ *
+ * 1) Range information:
+ *    This is used by the dom0 kernel to ask the hypervisor about various 
+ *    address information. This information is needed to allow kexec-tools 
+ *    to fill in the ELF headers for /proc/vmcore properly.
+ *
+ * 2) Load and unload of images:
+ *    There are no big surprises here, the kexec binary from kexec-tools
+ *    runs in userspace in dom0. The tool loads/unloads data into the
+ *    dom0 kernel such as new kernel, initramfs and hypervisor. When
+ *    loaded the dom0 kernel performs a load hypercall operation, and
+ *    before releasing all page references the dom0 kernel calls unload.
+ *
+ * 3) Kexec operation:
+ *    This is used to start a previously loaded kernel.
+ */
+
+#include "xen.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+#define KEXEC_XEN_NO_PAGES 17
+#endif
+
+/*
+ * Prototype for this hypercall is:
+ *  int kexec_op(int cmd, void *args)
+ * @cmd  == KEXEC_CMD_... 
+ *          KEXEC operation to perform
+ * @args == Operation-specific extra arguments (NULL if none).
+ */
+
+/*
+ * Kexec supports two types of operation:
+ * - kexec into a regular kernel, very similar to a standard reboot
+ *   - KEXEC_TYPE_DEFAULT is used to specify this type
+ * - kexec into a special "crash kernel", aka kexec-on-panic
+ *   - KEXEC_TYPE_CRASH is used to specify this type
+ *   - parts of our system may be broken at kexec-on-panic time
+ *     - the code should be kept as simple and self-contained as possible
+ */
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+
+
+/* The kexec implementation for Xen allows the user to load two
+ * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
+ * All data needed for a kexec reboot is kept in one xen_kexec_image_t
+ * per "instance". The data mainly consists of machine address lists to pages
+ * together with destination addresses. The data in xen_kexec_image_t
+ * is passed to the "code page" which is one page of code that performs
+ * the final relocations before jumping to the new kernel.
+ */
+ 
+typedef struct xen_kexec_image {
+#if defined(__i386__) || defined(__x86_64__)
+    unsigned long page_list[KEXEC_XEN_NO_PAGES];
+#endif
+#if defined(__ia64__)
+    unsigned long reboot_code_buffer;
+#endif
+    unsigned long indirection_page;
+    unsigned long start_address;
+} xen_kexec_image_t;
+
+/*
+ * Perform kexec having previously loaded a kexec or kdump kernel
+ * as appropriate.
+ * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ */
+#define KEXEC_CMD_kexec                 0
+typedef struct xen_kexec_exec {
+    int type;
+} xen_kexec_exec_t;
+
+/*
+ * Load/Unload kernel image for kexec or kdump.
+ * type  == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ * image == relocation information for kexec (ignored for unload) [in]
+ */
+#define KEXEC_CMD_kexec_load            1
+#define KEXEC_CMD_kexec_unload          2
+typedef struct xen_kexec_load {
+    int type;
+    xen_kexec_image_t image;
+} xen_kexec_load_t;
+
+#define KEXEC_RANGE_MA_CRASH      0 /* machine address and size of crash area 
*/
+#define KEXEC_RANGE_MA_XEN        1 /* machine address and size of Xen itself 
*/
+#define KEXEC_RANGE_MA_CPU        2 /* machine address and size of a CPU note 
*/
+#define KEXEC_RANGE_MA_XENHEAP    3 /* machine address and size of xenheap
+                                     * Note that although this is adjacent
+                                     * to Xen it exists in a separate EFI
+                                     * region on ia64, and thus needs to be
+                                     * inserted into iomem_machine separately 
*/
+#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* machine address and size of
+                                     * the ia64_boot_param */
+#define KEXEC_RANGE_MA_EFI_MEMMAP 5 /* machine address and size of
+                                     * the EFI Memory Map */
+#define KEXEC_RANGE_MA_VMCOREINFO 6 /* machine address and size of vmcoreinfo 
*/
+
+/*
+ * Find the address and size of certain memory areas
+ * range == KEXEC_RANGE_... [in]
+ * nr    == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in]
+ * size  == number of bytes reserved in window [out]
+ * start == address of the first byte in the window [out]
+ */
+#define KEXEC_CMD_kexec_get_range       3
+typedef struct xen_kexec_range {
+    int range;
+    int nr;
+    unsigned long size;
+    unsigned long start;
+} xen_kexec_range_t;
+
+#endif /* _XEN_PUBLIC_KEXEC_H */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 9f2d370..2e23363 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -58,6 +58,7 @@
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
 #define __HYPERVISOR_hvm_op               34
+#define __HYPERVISOR_kexec_op             37
 #define __HYPERVISOR_tmem_op              38
 
 /* Architecture-specific hypercall definitions. */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87b77de..b92fdf0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -35,18 +35,26 @@
 #include <linux/kmsg_dump.h>
 #include <linux/syscore_ops.h>
 
+#include <xen/xen-ops.h>
+
 #include <asm/page.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/system.h>
 #include <asm/sections.h>
 
+#include <asm/xen/page.h>
+
 /* Per cpu memory for storing cpu states in case of system crash. */
 note_buf_t __percpu *crash_notes;
 
 /* vmcoreinfo stuff */
 static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+#if defined(CONFIG_XEN) && defined(CONFIG_X86)
+u32 __page_aligned_bss vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+#else
 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+#endif
 size_t vmcoreinfo_size;
 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
 
@@ -357,13 +365,26 @@ static int kimage_is_destination_range(struct kimage 
*image,
        return 0;
 }
 
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order, 
unsigned long limit)
 {
        struct page *pages;
 
        pages = alloc_pages(gfp_mask, order);
        if (pages) {
                unsigned int count, i;
+#ifdef CONFIG_XEN
+               int address_bits;
+
+               if (limit == ~0UL)
+                       address_bits = BITS_PER_LONG;
+               else
+                       address_bits = ilog2(limit);
+
+               if (xen_create_contiguous_region((unsigned 
long)page_address(pages), order, address_bits) < 0) {
+                       __free_pages(pages, order);
+                       return NULL;
+               }
+#endif
                pages->mapping = NULL;
                set_page_private(pages, order);
                count = 1 << order;
@@ -427,10 +448,10 @@ static struct page 
*kimage_alloc_normal_control_pages(struct kimage *image,
        do {
                unsigned long pfn, epfn, addr, eaddr;
 
-               pages = kimage_alloc_pages(GFP_KERNEL, order);
+               pages = kimage_alloc_pages(GFP_KERNEL, order, 
KEXEC_CONTROL_MEMORY_LIMIT);
                if (!pages)
                        break;
-               pfn   = page_to_pfn(pages);
+               pfn   = pfn_to_mfn(page_to_pfn(pages));
                epfn  = pfn + count;
                addr  = pfn << PAGE_SHIFT;
                eaddr = epfn << PAGE_SHIFT;
@@ -464,6 +485,7 @@ static struct page 
*kimage_alloc_normal_control_pages(struct kimage *image,
        return pages;
 }
 
+#ifndef CONFIG_XEN
 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
                                                      unsigned int order)
 {
@@ -517,7 +539,7 @@ static struct page *kimage_alloc_crash_control_pages(struct 
kimage *image,
                }
                /* If I don't overlap any segments I have found my hole! */
                if (i == image->nr_segments) {
-                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+                       pages = pfn_to_page(mfn_to_pfn(hole_start >> 
PAGE_SHIFT));
                        break;
                }
        }
@@ -544,6 +566,13 @@ struct page *kimage_alloc_control_pages(struct kimage 
*image,
 
        return pages;
 }
+#else /* CONFIG_XEN */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+                                        unsigned int order)
+{
+       return kimage_alloc_normal_control_pages(image, order);
+}
+#endif
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
@@ -559,7 +588,7 @@ static int kimage_add_entry(struct kimage *image, 
kimage_entry_t entry)
                        return -ENOMEM;
 
                ind_page = page_address(page);
-               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               *image->entry = virt_to_machine(ind_page).maddr | 
IND_INDIRECTION;
                image->entry = ind_page;
                image->last_entry = ind_page +
                                      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -618,13 +647,13 @@ static void kimage_terminate(struct kimage *image)
 #define for_each_kimage_entry(image, ptr, entry) \
        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
                ptr = (entry & IND_INDIRECTION)? \
-                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
+                       phys_to_virt(machine_to_phys(XMADDR(entry & 
PAGE_MASK)).paddr): ptr +1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
        struct page *page;
 
-       page = pfn_to_page(entry >> PAGE_SHIFT);
+       page = pfn_to_page(mfn_to_pfn(entry >> PAGE_SHIFT));
        kimage_free_pages(page);
 }
 
@@ -636,6 +665,10 @@ static void kimage_free(struct kimage *image)
        if (!image)
                return;
 
+#ifdef CONFIG_XEN
+       xen_machine_kexec_unload(image);
+#endif
+
        kimage_free_extra_pages(image);
        for_each_kimage_entry(image, ptr, entry) {
                if (entry & IND_INDIRECTION) {
@@ -711,7 +744,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
         * have a match.
         */
        list_for_each_entry(page, &image->dest_pages, lru) {
-               addr = page_to_pfn(page) << PAGE_SHIFT;
+               addr = pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT;
                if (addr == destination) {
                        list_del(&page->lru);
                        return page;
@@ -722,16 +755,16 @@ static struct page *kimage_alloc_page(struct kimage 
*image,
                kimage_entry_t *old;
 
                /* Allocate a page, if we run out of memory give up */
-               page = kimage_alloc_pages(gfp_mask, 0);
+               page = kimage_alloc_pages(gfp_mask, 0, 
KEXEC_SOURCE_MEMORY_LIMIT);
                if (!page)
                        return NULL;
                /* If the page cannot be used file it away */
-               if (page_to_pfn(page) >
+               if (pfn_to_mfn(page_to_pfn(page)) >
                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
                        list_add(&page->lru, &image->unuseable_pages);
                        continue;
                }
-               addr = page_to_pfn(page) << PAGE_SHIFT;
+               addr = pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT;
 
                /* If it is the destination page we want use it */
                if (addr == destination)
@@ -754,7 +787,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
                        struct page *old_page;
 
                        old_addr = *old & PAGE_MASK;
-                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       old_page = pfn_to_page(mfn_to_pfn(old_addr >> 
PAGE_SHIFT));
                        copy_highpage(page, old_page);
                        *old = addr | (*old & ~PAGE_MASK);
 
@@ -810,7 +843,7 @@ static int kimage_load_normal_segment(struct kimage *image,
                        result  = -ENOMEM;
                        goto out;
                }
-               result = kimage_add_page(image, page_to_pfn(page)
+               result = kimage_add_page(image, pfn_to_mfn(page_to_pfn(page))
                                                                << PAGE_SHIFT);
                if (result < 0)
                        goto out;
@@ -842,6 +875,7 @@ out:
        return result;
 }
 
+#ifndef CONFIG_XEN
 static int kimage_load_crash_segment(struct kimage *image,
                                        struct kexec_segment *segment)
 {
@@ -864,7 +898,7 @@ static int kimage_load_crash_segment(struct kimage *image,
                char *ptr;
                size_t uchunk, mchunk;
 
-               page = pfn_to_page(maddr >> PAGE_SHIFT);
+               page = pfn_to_page(mfn_to_pfn(maddr >> PAGE_SHIFT));
                if (!page) {
                        result  = -ENOMEM;
                        goto out;
@@ -913,6 +947,13 @@ static int kimage_load_segment(struct kimage *image,
 
        return result;
 }
+#else /* CONFIG_XEN */
+static int kimage_load_segment(struct kimage *image,
+                               struct kexec_segment *segment)
+{
+       return kimage_load_normal_segment(image, segment);
+}
+#endif
 
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
@@ -1016,6 +1057,13 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, 
unsigned long, nr_segments,
                }
                kimage_terminate(image);
        }
+#ifdef CONFIG_XEN
+       if (image) {
+               result = xen_machine_kexec_load(image);
+               if (result)
+                       goto out;
+       }
+#endif
        /* Install the new kernel, and  Uninstall the old */
        image = xchg(dest_image, image);
 
@@ -1106,8 +1154,8 @@ void __weak crash_free_reserved_phys_range(unsigned long 
begin,
        unsigned long addr;
 
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
-               ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
-               init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
+               ClearPageReserved(pfn_to_page(mfn_to_pfn(addr >> PAGE_SHIFT)));
+               init_page_count(pfn_to_page(mfn_to_pfn(addr >> PAGE_SHIFT)));
                free_page((unsigned long)__va(addr));
                totalram_pages++;
        }
@@ -1216,6 +1264,7 @@ static int __init crash_notes_memory_init(void)
 module_init(crash_notes_memory_init)
 
 
+#ifndef CONFIG_XEN
 /*
  * parsing the "crashkernel" commandline
  *
@@ -1378,6 +1427,7 @@ int __init parse_crashkernel(char                  
*cmdline,
 
        return 0;
 }
+#endif
 
 
 
@@ -1435,7 +1485,18 @@ static int __init crash_save_vmcoreinfo_init(void)
 
        VMCOREINFO_SYMBOL(init_uts_ns);
        VMCOREINFO_SYMBOL(node_online_map);
+#ifndef CONFIG_X86_XEN
        VMCOREINFO_SYMBOL(swapper_pg_dir);
+#else
+/*
+ * Since for x86-32 Xen swapper_pg_dir is a pointer rather than an array,
+ * make the value stored consistent with native (i.e. the base address of
+ * the page directory).
+ */
+# define swapper_pg_dir *swapper_pg_dir
+       VMCOREINFO_SYMBOL(swapper_pg_dir);
+# undef swapper_pg_dir
+#endif
        VMCOREINFO_SYMBOL(_stext);
        VMCOREINFO_SYMBOL(vmlist);
 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-devel] [RFC][PATCH] xen: Kexec patch for pvops kernel, Daniel Kiper <=