xen-devel

[Xen-devel] [PATCH][HVM] vmx domain save/restore support

To: Ian Pratt <Ian.Pratt@xxxxxxxxxxxx>, Keir Fraser <Keir.Fraser@xxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH][HVM] vmx domain save/restore support
From: "Zhai, Edwin" <edwin.zhai@xxxxxxxxx>
Date: Wed, 19 Jul 2006 17:22:37 +0800
Cc: xen-devel@xxxxxxxxxxxxxxxxxxx, "Zhai, Edwin" <edwin.zhai@xxxxxxxxx>
Delivery-date: Wed, 19 Jul 2006 02:26:48 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Organization: intel
Reply-to: edwin.zhai@xxxxxxxxx
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Thunderbird 1.5 (X11/20051201)
Attached is the VMX domain save/restore patch. It works well for different guest/host combinations without breaking domU save/restore, according to my testing.
Please apply Xiaowei's qemu-dm fix first.

=== Known issues ===
* shpage pfn *
The hypervisor looks up the shpage pfn in an e820 entry at init time, but some guests (Windows, EM64T Linux) reuse that e820 RAM, so the shpage pfn is lost across a restore. This patch therefore marks the entry as "reserved" to keep the guest from reusing it (see the xc_hvm_build.c change); we can revisit this if a better solution turns up later.
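For reference, a minimal sketch of the two build_e820map() entries as they look after this patch (it simply mirrors the xc_hvm_build.c hunk further down): the RAM entry now stops at 0x90000 and the reserved entry starts there, so the guest's e820 never hands that range out as RAM.

    /* sketch of the two build_e820map() entries touched by this patch */
    e820entry[nr_map].addr = 0x0;
    e820entry[nr_map].size = 0x90000;        /* was 0x9F800 */
    e820entry[nr_map].type = E820_RAM;
    nr_map++;

    e820entry[nr_map].addr = 0x90000;        /* range holding the shpage pfn */
    e820entry[nr_map].size = 0x800;
    e820entry[nr_map].type = E820_RESERVED;  /* guest must not reuse this */
    nr_map++;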

* 64-bit host VMX restore Python error *
When restoring a VMX guest on an EM64T host I get the error "ERROR (xmlrpclib2:167) int exceeds XML-RPC limits", although it does not block the restore. "xend restart" works as a workaround.

* guest SMP support *
I am currently working on guest SMP support, including APIC/VMCS save/restore. Until that lands, turning on "apic" in the config file may cause save/restore to fail.
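In the meantime the safe setting is to leave the local APIC disabled for any guest you intend to save. A minimal sketch, assuming the usual xm HVM guest config syntax:

    # HVM guest config: keep apic off until apic/vmcs save/restore is supported
    apic=0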

* guest save/restore across platforms *
E.g. save a 32-bit guest on a 64-bit host, then restore it on a 32-bit host. We cannot support this yet because save/restore sees a different vcpu_context format on each host type; a universal format would be needed.

=== test report ===

"+" stands for okay, "-" stands for fail

   32b host:
       + 32/32
       + 32win/32

   pae host:
       + 32/pae
       + pae/pae
       + 32win/pae
       + pae_win/pae


   em64t host:
       + 32/64
       + pae/64
       + 64/64
       + 32win/64
       + pae_win/64

Sometimes pae_win/64 is not stable :(
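For reference, the save/restore path above is driven with the usual xm commands; a minimal sketch, assuming a hypothetical HVM guest named hvm1:

    xm save hvm1 /var/tmp/hvm1.save
    xm restore /var/tmp/hvm1.save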



# HG changeset patch
# User Edwin Zhai <edwin.zhai@xxxxxxxxx>
# Node ID 2abb1c801ab72ee7e88b144871162fe2e47a0970
# Parent  98c3ddf83a59b0cbbdce63bb210adfd0d2ec1aea
vmx save/restore support

Signed-off-by: Zhai Edwin <edwin.zhai@xxxxxxxxx>
Signed-off-by: Dong Eddie <eddie.dong@xxxxxxxxx>
Signed-off-by: Nakajima Jun <jun.nakajima@xxxxxxxxx>

diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/ioemu/hw/cirrus_vga.c
--- a/tools/ioemu/hw/cirrus_vga.c       Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/ioemu/hw/cirrus_vga.c       Wed Jul 19 16:09:59 2006 +0800
@@ -3010,11 +3010,44 @@ static CPUWriteMemoryFunc *cirrus_mmio_w
     cirrus_mmio_writel,
 };
 
+void cirrus_stop_acc(CirrusVGAState *s)
+{
+    if (s->map_addr){
+        int error;
+        s->map_addr = 0;
+        error = unset_vram_mapping(s->cirrus_lfb_addr,
+                s->cirrus_lfb_end);
+        fprintf(stderr, "cirrus_stop_acc:unset_vram_mapping.\n");
+
+        munmap(s->vram_ptr, VGA_RAM_SIZE);
+    }
+}
+
+void cirrus_restart_acc(CirrusVGAState *s)
+{
+    if (s->cirrus_lfb_addr && s->cirrus_lfb_end) {
+        void *vram_pointer, *old_vram;
+        fprintf(stderr, "cirrus_vga_load:re-enable vga acc.lfb_addr=0x%lx, 
lfb_end=0x%lx.\n",
+                s->cirrus_lfb_addr, s->cirrus_lfb_end);
+        vram_pointer = set_vram_mapping(s->cirrus_lfb_addr ,s->cirrus_lfb_end);
+        if (!vram_pointer){
+            fprintf(stderr, "cirrus_vga_load:NULL vram_pointer\n");
+        } else {
+            old_vram = vga_update_vram((VGAState *)s, vram_pointer,
+                    VGA_RAM_SIZE);
+            qemu_free(old_vram);
+            s->map_addr = s->cirrus_lfb_addr;
+            s->map_end = s->cirrus_lfb_end;
+        }
+    }
+}
+
 /* load/save state */
 
 static void cirrus_vga_save(QEMUFile *f, void *opaque)
 {
     CirrusVGAState *s = opaque;
+    uint8_t vga_acc;
 
     qemu_put_be32s(f, &s->latch);
     qemu_put_8s(f, &s->sr_index);
@@ -3049,11 +3082,20 @@ static void cirrus_vga_save(QEMUFile *f,
     qemu_put_be32s(f, &s->hw_cursor_y);
     /* XXX: we do not save the bitblt state - we assume we do not save
        the state when the blitter is active */
+
+    vga_acc = (!!s->map_addr);
+    qemu_put_8s(f, &vga_acc);
+    qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
+    qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
+    qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
+    if (vga_acc)
+        cirrus_stop_acc(s);
 }
 
 static int cirrus_vga_load(QEMUFile *f, void *opaque, int version_id)
 {
     CirrusVGAState *s = opaque;
+    uint8_t vga_acc = 0;
 
     if (version_id != 1)
         return -EINVAL;
@@ -3091,6 +3133,14 @@ static int cirrus_vga_load(QEMUFile *f, 
 
     qemu_get_be32s(f, &s->hw_cursor_x);
     qemu_get_be32s(f, &s->hw_cursor_y);
+
+    qemu_get_8s(f, &vga_acc);
+    qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
+    qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
+    qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
+    if (vga_acc){
+        cirrus_restart_acc(s);
+    }
 
     /* force refresh */
     s->graphic_mode = -1;
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/ioemu/target-i386-dm/helper2.c
--- a/tools/ioemu/target-i386-dm/helper2.c      Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/ioemu/target-i386-dm/helper2.c      Wed Jul 19 16:09:59 2006 +0800
@@ -457,6 +457,7 @@ int main_loop(void)
 {
     extern int vm_running;
     extern int shutdown_requested;
+    extern int suspend_requested;
     CPUState *env = cpu_single_env;
     int evtchn_fd = xc_evtchn_fd(xce_handle);
 
@@ -472,6 +473,10 @@ int main_loop(void)
                 qemu_system_reset();
                 reset_requested = 0;
             }
+            if (suspend_requested) {
+                fprintf(logfile, "device model received suspend signal!\n");
+                break;
+            }
         }
 
         /* Wait up to 10 msec. */
@@ -483,7 +488,15 @@ int main_loop(void)
                              shared_page->vcpu_iodata[send_vcpu].dm_eport);
         }
     }
-    destroy_hvm_domain();
+    if (!suspend_requested)
+        destroy_hvm_domain();
+    else {
+        char qemu_file[20];
+        sprintf(qemu_file, "/tmp/xen.qemu-dm.%d", domid);
+        if (qemu_savevm(qemu_file) < 0)
+            fprintf(stderr, "qemu save fail.\n");
+    }
+
     return 0;
 }
 
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/ioemu/vl.c
--- a/tools/ioemu/vl.c  Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/ioemu/vl.c  Wed Jul 19 16:09:59 2006 +0800
@@ -3884,6 +3884,11 @@ int qemu_loadvm(const char *filename)
         qemu_fseek(f, cur_pos + record_len, SEEK_SET);
     }
     fclose(f);
+
+    /* del tmp file */
+    if (unlink(filename) == -1)
+        fprintf(stderr, "delete tmp qemu state file failed.\n");
+
     ret = 0;
  the_end:
     if (saved_vm_running)
@@ -4470,6 +4475,7 @@ static QEMUResetEntry *first_reset_entry
 static QEMUResetEntry *first_reset_entry;
 int reset_requested;
 int shutdown_requested;
+int suspend_requested;
 static int powerdown_requested;
 
 void qemu_register_reset(QEMUResetHandler *func, void *opaque)
@@ -5242,6 +5248,14 @@ int set_mm_mapping(int xc_handle, uint32
 #endif
 
     return 0;
+}
+
+void suspend(int sig)
+{
+   fprintf(logfile, "suspend sig handler called with requested=%d!\n", 
suspend_requested);
+    if (sig != SIGUSR1)
+        fprintf(logfile, "suspend signal dismatch, get sig=%d!\n", sig);
+    suspend_requested = 1;
 }
 
 int main(int argc, char **argv)
@@ -6010,6 +6024,27 @@ int main(int argc, char **argv)
             vm_start();
         }
     }
+
+    /* register signal for the suspend request when save */
+    {
+        struct sigaction act;
+        sigset_t set;
+        act.sa_handler = suspend;
+        act.sa_flags = SA_RESTART;
+        sigemptyset(&act.sa_mask);
+
+        if (sigaction(SIGUSR1, &act, 0) == -1)
+            fprintf(stderr, "sigaction fail!\n");
+
+        /* control panel mask some signals when spawn qemu, need unmask here*/
+        sigemptyset(&set);
+        sigaddset(&set, SIGUSR1);
+        sigaddset(&set, SIGTERM);
+        if (sigprocmask(SIG_UNBLOCK, &set, NULL) == -1)
+            fprintf(stderr, "unblock signal fail!\n");
+
+    }
+
     main_loop();
     quit_timers();
     return 0;
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/Makefile
--- a/tools/libxc/Makefile      Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/Makefile      Wed Jul 19 16:09:59 2006 +0800
@@ -33,7 +33,7 @@ GUEST_SRCS-$(CONFIG_X86) += xc_linux_bui
 GUEST_SRCS-$(CONFIG_X86) += xc_linux_build.c
 GUEST_SRCS-$(CONFIG_IA64) += xc_ia64_stubs.c xc_linux_build.c
 GUEST_SRCS-$(CONFIG_MIGRATE) += xc_linux_restore.c xc_linux_save.c
-GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
+GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_restore.c xc_hvm_save.c
 
 CFLAGS   += -Werror
 CFLAGS   += -fno-strict-aliasing
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xc_domain.c   Wed Jul 19 16:09:59 2006 +0800
@@ -182,6 +182,50 @@ int xc_domain_getinfolist(int xc_handle,
         ret = -1;
 
     return ret;
+}
+
+/* get info from hvm guest for save */
+int xc_domain_hvm_getcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt)
+{
+    int rc;
+    DECLARE_DOM0_OP;
+
+    op.cmd = DOM0_GETHVMCONTEXT;
+    op.u.gethvmcontext.domain = (domid_t)domid;
+    set_xen_guest_handle(op.u.gethvmcontext.hvm_ctxt, hvm_ctxt);
+
+    if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 )
+        return rc;
+
+    rc = do_dom0_op(xc_handle, &op);
+
+    safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt));
+
+    return rc;
+}
+
+/* set info to hvm guest for restore */
+int xc_domain_hvm_setcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt)
+{
+    int rc;
+    DECLARE_DOM0_OP;
+
+    op.cmd = DOM0_SETHVMCONTEXT;
+    op.u.sethvmcontext.domain = (domid_t)domid;
+    set_xen_guest_handle(op.u.gethvmcontext.hvm_ctxt, hvm_ctxt);
+
+    if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 )
+        return rc;
+
+    rc = do_dom0_op(xc_handle, &op);
+
+    safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt));
+
+    return rc;
 }
 
 int xc_vcpu_getcontext(int xc_handle,
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c        Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xc_hvm_build.c        Wed Jul 19 16:09:59 2006 +0800
@@ -60,11 +60,11 @@ static unsigned char build_e820map(void 
 
     /* XXX: Doesn't work for > 4GB yet */
     e820entry[nr_map].addr = 0x0;
-    e820entry[nr_map].size = 0x9F800;
+    e820entry[nr_map].size = 0x90000;
     e820entry[nr_map].type = E820_RAM;
     nr_map++;
 
-    e820entry[nr_map].addr = 0x9F800;
+    e820entry[nr_map].addr = 0x90000;
     e820entry[nr_map].size = 0x800;
     e820entry[nr_map].type = E820_RESERVED;
     nr_map++;
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c       Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xc_linux_save.c       Wed Jul 19 16:09:59 2006 +0800
@@ -261,15 +261,6 @@ static int ratewrite(int io_fd, void *bu
 #endif
 
 
-static inline ssize_t write_exact(int fd, void *buf, size_t count)
-{
-    if(write(fd, buf, count) != count)
-        return 0;
-    return 1;
-}
-
-
-
 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                        xc_shadow_control_stats_t *stats, int print)
 {
@@ -358,7 +349,7 @@ static int analysis_phase(int xc_handle,
 }
 
 
-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                              int dom, xc_dominfo_t *info,
                              vcpu_guest_context_t *ctxt)
 {
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xenctrl.h     Wed Jul 19 16:09:59 2006 +0800
@@ -286,6 +286,30 @@ int xc_domain_getinfolist(int xc_handle,
                           xc_domaininfo_t *info);
 
 /**
+ * This function returns information about the context of a hvm domain
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm domid the domain to get information from
+ * @parm hvm_ctxt a pointer to a structure to store the execution context of the
+ *            hvm domain
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_hvm_getcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt);
+
+/**
+ * This function will set the context for hvm domain
+ *
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm domid the domain to set the hvm domain context for
+ * @parm hvm_ctxt pointer to the the hvm context with the values to set
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_hvm_setcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt);
+
+/**
  * This function returns information about the execution context of a
  * particular vcpu of a domain.
  *
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xenguest.h    Wed Jul 19 16:09:59 2006 +0800
@@ -11,6 +11,7 @@
 
 #define XCFLAGS_LIVE      1
 #define XCFLAGS_DEBUG     2
+#define XCFLAGS_HVM       4
 
 
 /**
@@ -25,6 +26,13 @@ int xc_linux_save(int xc_handle, int io_
                   uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
                   int (*suspend)(int domid));
 
+/**
+ * This function will save a hvm domain running unmodified guest.
+ * @return 0 on success, -1 on failure
+ */
+int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                  uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
+                  int (*suspend)(int domid));
 
 /**
  * This function will restore a saved domain running Linux.
@@ -41,6 +49,17 @@ int xc_linux_restore(int xc_handle, int 
                      unsigned long nr_pfns, unsigned int store_evtchn,
                      unsigned long *store_mfn, unsigned int console_evtchn,
                      unsigned long *console_mfn);
+
+/**
+ * This function will restore a saved hvm domain running unmodified guest.
+ *
+ * @parm store_mfn pass mem size & returned with the mfn of the store page
+ * @return 0 on success, -1 on failure
+ */
+int xc_hvm_restore(int xc_handle, int io_fd, uint32_t dom,
+                      unsigned long nr_pfns, unsigned int store_evtchn,
+                      unsigned long *store_mfn, unsigned int console_evtchn,
+                      unsigned long *console_mfn);
 
 /**
  * This function will create a domain for a paravirtualized Linux
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/libxc/xg_save_restore.h     Wed Jul 19 16:09:59 2006 +0800
@@ -65,6 +65,16 @@ static int get_platform_info(int xc_hand
     return 1;
 }
 
+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+    if(write(fd, buf, count) != count)
+        return 0;
+    return 1;
+}
+
+extern int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+                             int dom, xc_dominfo_t *info,
+                             vcpu_guest_context_t *ctxt);
 
 /*
 ** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables.
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Jul 19 16:09:59 2006 +0800
@@ -132,6 +132,20 @@ static PyObject *pyxc_domain_destroy(XcO
 static PyObject *pyxc_domain_destroy(XcObject *self, PyObject *args)
 {
     return dom_op(self, args, xc_domain_destroy);
+}
+
+static PyObject *pyxc_domain_shutdown(XcObject *self, PyObject *args)
+{
+    uint32_t dom, reason;
+
+    if (!PyArg_ParseTuple(args, "ii", &dom, &reason))
+      return NULL;
+
+    if (xc_domain_shutdown(self->xc_handle, dom, reason) != 0)
+        return PyErr_SetFromErrno(xc_error);
+    
+    Py_INCREF(zero);
+    return zero;
 }
 
 
@@ -966,6 +980,14 @@ static PyMethodDef pyxc_methods[] = {
       METH_VARARGS, "\n"
       "Destroy a domain.\n"
       " dom [int]:    Identifier of domain to be destroyed.\n\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "domain_shutdown", 
+      (PyCFunction)pyxc_domain_shutdown,
+      METH_VARARGS, "\n"
+      "Shutdown a domain.\n"
+      " dom       [int, 0]:      Domain identifier to use.\n"
+      " reason     [int, 0]:      Reason for shutdown.\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
     { "vcpu_setaffinity", 
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py   Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/python/xen/xend/XendCheckpoint.py   Wed Jul 19 16:09:59 2006 +0800
@@ -25,11 +25,14 @@ from XendDomainInfo import DEV_MIGRATE_S
 from XendDomainInfo import DEV_MIGRATE_STEP3
 
 SIGNATURE = "LinuxGuestRecord"
+QEMU_SIGNATURE = "QemuDeviceModelRecord"
+dm_batch = 512
 XC_SAVE = "xc_save"
 XC_RESTORE = "xc_restore"
 
 
 sizeof_int = calcsize("i")
+sizeof_unsigned_int = calcsize("I")
 sizeof_unsigned_long = calcsize("L")
 
 
@@ -72,6 +75,10 @@ def save(fd, dominfo, network, live, dst
                     "could not write guest state file: config len")
         write_exact(fd, config, "could not write guest state file: config")
 
+        hvm = 0
+        if dominfo.info['image'][0] == 'hvm':
+            hvm = 1
+        log.info("save hvm domain %d", hvm)
         # xc_save takes three customization parameters: maxit, max_f, and
         # flags the last controls whether or not save is 'live', while the
         # first two further customize behaviour when 'live' save is
@@ -79,7 +86,7 @@ def save(fd, dominfo, network, live, dst
         # libxenguest; see the comments and/or code in xc_linux_save() for
         # more information.
         cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(xc.handle()), str(fd),
-               str(dominfo.getDomid()), "0", "0", str(int(live)) ]
+               str(dominfo.getDomid()), "0", "0", str(int(live) | int(hvm << 2)) ]
         log.debug("[xc_save]: %s", string.join(cmd))
 
         def saveInputHandler(line, tochild):
@@ -93,11 +100,28 @@ def save(fd, dominfo, network, live, dst
                 log.info("Domain %d suspended.", dominfo.getDomid())
                 dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP3,
                                        domain_name)
+                #send signal to device model for save
+                if hvm == 1:
+                    log.info("release_devices for hvm domain")
+                    dominfo.release_devices(True)
                 tochild.write("done\n")
                 tochild.flush()
                 log.debug('Written done')
 
         forkHelper(cmd, fd, saveInputHandler, False)
+
+        # put qemu device model state
+        if hvm:
+            write_exact(fd, QEMU_SIGNATURE, "could not write qemu signature")
+            qemu_fd = os.open("/tmp/xen.qemu-dm.%d" % dominfo.getDomid(), os.O_RDONLY)
+            while True:
+                buf = os.read(qemu_fd, dm_batch)
+                if len(buf):
+                    write_exact(fd, buf, "could not write device model state")
+                else:
+                    break
+            os.close(qemu_fd)
+            os.remove("/tmp/xen.qemu-dm.%d" % dominfo.getDomid())
 
         dominfo.destroyDomain()
 
@@ -139,10 +163,21 @@ def restore(xd, fd):
     assert store_port
     assert console_port
 
-    try:
-        l = read_exact(fd, sizeof_unsigned_long,
-                       "not a valid guest state file: pfn count read")
-        nr_pfns = unpack("L", l)[0]    # native sizeof long
+    #if hvm, pass mem size to calculate the store_mfn
+    hvm = 0
+    if dominfo.info['image'][0] == 'hvm':
+        hvm = dominfo.info['memory']
+        log.info("restore hvm domain %d, mem=%d", dominfo.domid, hvm)
+
+    try:
+        if hvm:
+            l = read_exact(fd, sizeof_unsigned_int,
+                    "not a valid hvm guest state file: pfn count read")
+            nr_pfns = unpack("I", l)[0]    # native sizeof int
+        else:
+            l = read_exact(fd, sizeof_unsigned_long,
+                           "not a valid guest state file: pfn count read")
+            nr_pfns = unpack("L", l)[0]    # native sizeof long
         if nr_pfns > 16*1024*1024:     # XXX 
             raise XendError(
                 "not a valid guest state file: pfn count out of range")
@@ -151,7 +186,7 @@ def restore(xd, fd):
 
         cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE),
                         xc.handle(), fd, dominfo.getDomid(), nr_pfns,
-                        store_port, console_port])
+                        store_port, console_port, hvm])
         log.debug("[xc_restore]: %s", string.join(cmd))
 
         handler = RestoreInputHandler()
@@ -163,6 +198,23 @@ def restore(xd, fd):
 
         dominfo.unpause()
 
+        # get qemu state and create a tmp file for dm restore
+        if hvm:
+            qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
+                "not a valid device model state: signature read")
+            if qemu_signature != QEMU_SIGNATURE:
+                raise XendError("not a valid device model state: found '%s'" %
+                                signature)
+            qemu_fd = os.open("/tmp/xen.qemu-dm.%d" % dominfo.getDomid(),
+                    os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
+            while True:
+                buf = os.read(fd, dm_batch)
+                if len(buf):
+                    write_exact(qemu_fd, buf, "could not write dm state to tmp file")
+                else:
+                    break
+            os.close(qemu_fd)
+        
         dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
 
         return dominfo
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/python/xen/xend/XendDomainInfo.py   Wed Jul 19 16:09:59 2006 +0800
@@ -668,6 +668,13 @@ class XendDomainInfo:
         self.console_mfn = console_mfn
 
         self.introduceDomain()
+        if self.info['image'][0] == 'hvm':
+            self.image = image.create(self,
+                                      self.info['image'],
+                                      self.info['device'])
+            if self.image:
+                log.debug("createDevicemodel for hvm domain restore")
+                self.image.createDeviceModel(True)
         self.storeDomDetails()
         self.registerWatches()
         self.refreshShutdown()
@@ -945,6 +952,13 @@ class XendDomainInfo:
             raise XendError('Invalid reason: %s' % reason)
         self.storeDom("control/shutdown", reason)
 
+        ## shutdown hypercall for hvm domain desides xenstore write
+        if self.info['image'][0] == 'hvm':
+            for code in shutdown_reasons.keys():
+                if shutdown_reasons[code] == reason:
+                    break
+            xc.domain_shutdown(self.domid, code)
+
 
     ## private:
 
@@ -1417,8 +1431,11 @@ class XendDomainInfo:
 
     ## private:
 
-    def release_devices(self):
+    def release_devices(self, suspend = False):
         """Release all domain's devices.  Nothrow guarantee."""
+        if suspend and self.image:
+            self.image.destroy(suspend)
+            return
 
         while True:
             t = xstransact("%s/device" % self.dompath)
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/python/xen/xend/image.py    Wed Jul 19 16:09:59 2006 +0800
@@ -157,7 +157,7 @@ class ImageHandler:
         """Build the domain. Define in subclass."""
         raise NotImplementedError()
 
-    def createDeviceModel(self):
+    def createDeviceModel(self, restore = False):
         """Create device model for the domain (define in subclass if 
needed)."""
         pass
     
@@ -331,7 +331,7 @@ class HVMImageHandler(ImageHandler):
             ret = ret + ['-vnc', '%d' % vncdisplay, '-k', 'en-us']
         return ret
 
-    def createDeviceModel(self):
+    def createDeviceModel(self, restore = False):
         if self.pid:
             return
         # Execute device model.
@@ -340,6 +340,8 @@ class HVMImageHandler(ImageHandler):
         args = args + ([ "-d",  "%d" % self.vm.getDomid(),
                   "-m", "%s" % (self.vm.getMemoryTarget() / 1024)])
         args = args + self.dmargs
+        if restore:
+            args = args + ([ "-loadvm", "/tmp/xen.qemu-dm.%d" % 
self.vm.getDomid() ])
         env = dict(os.environ)
         if self.display:
             env['DISPLAY'] = self.display
@@ -351,12 +353,16 @@ class HVMImageHandler(ImageHandler):
         self.pid = os.spawnve(os.P_NOWAIT, self.device_model, args, env)
         log.info("device model pid: %d", self.pid)
 
-    def destroy(self):
+    def destroy(self, suspend = False):
         self.unregister_shutdown_watch();
         import signal
         if not self.pid:
             return
-        os.kill(self.pid, signal.SIGKILL)
+        sig = signal.SIGKILL
+        if suspend:
+            log.info("use sigusr1 to signal qemu %d", self.pid)
+            sig = signal.SIGUSR1
+        os.kill(self.pid, sig)
         os.waitpid(self.pid, 0)
         self.pid = 0
 
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/xcutils/xc_restore.c
--- a/tools/xcutils/xc_restore.c        Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/xcutils/xc_restore.c        Wed Jul 19 16:09:59 2006 +0800
@@ -18,12 +18,13 @@ main(int argc, char **argv)
 main(int argc, char **argv)
 {
     unsigned int xc_fd, io_fd, domid, nr_pfns, store_evtchn, console_evtchn;
+    unsigned int hvm;
     int ret;
     unsigned long store_mfn, console_mfn;
 
-    if (argc != 7)
+    if (argc != 8)
        errx(1,
-            "usage: %s xcfd iofd domid nr_pfns store_evtchn console_evtchn",
+            "usage: %s xcfd iofd domid nr_pfns store_evtchn console_evtchn 
is_hvm",
             argv[0]);
 
     xc_fd = atoi(argv[1]);
@@ -32,9 +33,17 @@ main(int argc, char **argv)
     nr_pfns = atoi(argv[4]);
     store_evtchn = atoi(argv[5]);
     console_evtchn = atoi(argv[6]);
+    hvm = atoi(argv[7]);
 
-    ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
-                          &store_mfn, console_evtchn, &console_mfn);
+    if (hvm) {
+        /* pass the memsize to xc_hvm_restore to find the store_mfn */
+        store_mfn = hvm;
+        ret = xc_hvm_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
+                &store_mfn, console_evtchn, &console_mfn);
+    } else
+        ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
+                &store_mfn, console_evtchn, &console_mfn);
+
     if (ret == 0) {
        printf("store-mfn %li\n", store_mfn);
        printf("console-mfn %li\n", console_mfn);
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c   Wed Jul 19 13:45:04 2006 +0800
+++ b/tools/xcutils/xc_save.c   Wed Jul 19 16:09:59 2006 +0800
@@ -47,5 +47,8 @@ main(int argc, char **argv)
     max_f = atoi(argv[5]);
     flags = atoi(argv[6]);
 
-    return xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
+    if (flags & XCFLAGS_HVM)
+        return xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
+    else
+        return xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
 }
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c   Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/dom0_ops.c   Wed Jul 19 16:09:59 2006 +0800
@@ -454,6 +454,7 @@ void arch_getdomaininfo_ctxt(
     if ( hvm_guest(v) )
     {
         hvm_store_cpu_guest_regs(v, &c->user_regs, c->ctrlreg);
+        hvm_save_cpu_context(v, &c->hvmcpu_ctxt);
     }
     else
     {
@@ -473,6 +474,25 @@ void arch_getdomaininfo_ctxt(
     c->ctrlreg[3] = xen_pfn_to_cr3(pagetable_get_pfn(v->arch.guest_table));
 
     c->vm_assist = v->domain->vm_assist;
+}
+
+void arch_gethvm_ctxt(
+    struct vcpu *v, struct hvm_domain_context *c)
+{
+    if ( !hvm_guest(v) )
+        return;
+
+    hvm_save(v, c);
+
+}
+
+void arch_sethvm_ctxt(
+        struct vcpu *v, struct hvm_domain_context *c)
+{
+    if ( !hvm_guest(v) )
+        return;
+
+    hvm_load(v, c);
 }
 
 /*
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/hvm.c    Wed Jul 19 16:09:59 2006 +0800
@@ -182,6 +182,8 @@ static void hvm_get_info(struct domain *
     unmap_domain_page(p);
 }
 
+extern void arch_sethvm_ctxt(
+    struct vcpu *, struct hvm_domain_context *);
 void hvm_setup_platform(struct domain* d)
 {
     struct hvm_domain *platform;
@@ -211,6 +213,16 @@ void hvm_setup_platform(struct domain* d
 
     init_timer(&platform->pl_time.periodic_tm.timer, pt_timer_fn, v, v->processor);
     pit_init(v, cpu_khz);
+
+    /* restore hvm context including pic/pit/shpage */
+    shpage_init(get_sp(d));
+
+    if (platform->hvm_ctxt) {
+        arch_sethvm_ctxt(current, platform->hvm_ctxt);
+        xfree(platform->hvm_ctxt);
+        platform->hvm_ctxt = NULL;
+    }
+
 }
 
 void pic_irq_request(void *data, int level)
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c  Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/i8254.c  Wed Jul 19 16:09:59 2006 +0800
@@ -357,6 +357,142 @@ static uint32_t pit_ioport_read(void *op
     return ret;
 }
 
+void pit_info(PITState *pit)
+{
+    PITChannelState *s;
+    int i;
+
+    for(i = 0; i < 3; i++) {
+        printk("*****pit channel %d's state:*****\n", i);
+        s = &pit->channels[i];
+        printk("pit 0x%x.\n", s->count);
+        printk("pit 0x%x.\n", s->latched_count);
+        printk("pit 0x%x.\n", s->count_latched);
+        printk("pit 0x%x.\n", s->status_latched);
+        printk("pit 0x%x.\n", s->status);
+        printk("pit 0x%x.\n", s->read_state);
+        printk("pit 0x%x.\n", s->write_state);
+        printk("pit 0x%x.\n", s->write_latch);
+        printk("pit 0x%x.\n", s->rw_mode);
+        printk("pit 0x%x.\n", s->mode);
+        printk("pit 0x%x.\n", s->bcd);
+        printk("pit 0x%x.\n", s->gate);
+        printk("pit %"PRId64"\n", s->count_load_time);
+
+        if (s->pt) {
+            struct periodic_time *pt = s->pt;
+            printk("pit channel %d has a periodic timer:\n", i);
+            printk("pt %d.\n", pt->enabled);
+            printk("pt %d.\n", pt->one_shot);
+            printk("pt %d.\n", pt->irq);
+            printk("pt %d.\n", pt->first_injected);
+
+            printk("pt %d.\n", pt->pending_intr_nr);
+            printk("pt %d.\n", pt->period);
+            printk("pt %"PRId64"\n", pt->period_cycles);
+            printk("pt %"PRId64"\n", pt->last_plt_gtime);
+        }
+    }
+
+}
+
+static void pit_save(hvm_domain_context_t *h, void *opaque)
+{
+    PITState *pit = opaque;
+    PITChannelState *s;
+    struct periodic_time *pt;
+    int i, pti = -1;
+    
+#ifdef HVM_DEBUG_SUSPEND
+    pit_info(pit);
+#endif
+
+    for(i = 0; i < 3; i++) {
+        s = &pit->channels[i];
+        hvm_put_32u(h, s->count);
+        hvm_put_16u(h, s->latched_count);
+        hvm_put_8u(h, s->count_latched);
+        hvm_put_8u(h, s->status_latched);
+        hvm_put_8u(h, s->status);
+        hvm_put_8u(h, s->read_state);
+        hvm_put_8u(h, s->write_state);
+        hvm_put_8u(h, s->write_latch);
+        hvm_put_8u(h, s->rw_mode);
+        hvm_put_8u(h, s->mode);
+        hvm_put_8u(h, s->bcd);
+        hvm_put_8u(h, s->gate);
+        hvm_put_64u(h, s->count_load_time);
+
+        if (s->pt && pti == -1)
+            pti = i;
+    }
+
+    /* save guest time */
+    pt = pit->channels[pti].pt;
+    hvm_put_8u(h, pti);
+    hvm_put_8u(h, pt->first_injected);
+    hvm_put_32u(h, pt->pending_intr_nr);
+    hvm_put_64u(h, pt->last_plt_gtime);
+
+}
+
+static int pit_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    PITState *pit = opaque;
+    PITChannelState *s;
+    int i, pti;
+    u32 period;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    for(i = 0; i < 3; i++) {
+        s = &pit->channels[i];
+        s->count = hvm_get_32u(h);
+        s->latched_count = hvm_get_16u(h);
+        s->count_latched = hvm_get_8u(h);
+        s->status_latched = hvm_get_8u(h);
+        s->status = hvm_get_8u(h);
+        s->read_state = hvm_get_8u(h);
+        s->write_state = hvm_get_8u(h);
+        s->write_latch = hvm_get_8u(h);
+        s->rw_mode = hvm_get_8u(h);
+        s->mode = hvm_get_8u(h);
+        s->bcd = hvm_get_8u(h);
+        s->gate = hvm_get_8u(h);
+        s->count_load_time = hvm_get_64u(h);
+    }
+
+    pti = hvm_get_8u(h);
+    s = &pit->channels[pti];
+    period = DIV_ROUND((s->count * 1000000000ULL), PIT_FREQ);
+
+    printk("recreate periodic timer %d in mode %d, freq=%d.\n", pti, s->mode, 
period);
+    switch (s->mode) {
+        case 2:
+            /* create periodic time */
+            s->pt = create_periodic_time(s, period, 0, 0);
+            break;
+        case 1:
+            /* create one shot time */
+            s->pt = create_periodic_time(s, period, 0, 1);
+            break;
+        default:
+            break;
+    }
+
+    s->pt->first_injected = hvm_get_8u(h);
+    s->pt->pending_intr_nr = hvm_get_32u(h);
+    s->pt->last_plt_gtime = hvm_get_64u(h);
+    /*XXX: need set_guest_time here or do this when post_inject? */
+ 
+#ifdef HVM_DEBUG_SUSPEND
+    pit_info(pit);
+#endif
+
+    return 0;
+}
+
 static void pit_reset(void *opaque)
 {
     PITState *pit = opaque;
@@ -385,6 +521,8 @@ void pit_init(struct vcpu *v, unsigned l
     s->vcpu = v;
     s++; s->vcpu = v;
     s++; s->vcpu = v;
+
+    hvm_register_savevm("xen_hvm_i8254", PIT_BASE, 1, pit_save, pit_load, pit);
 
     register_portio_handler(PIT_BASE, 4, handle_pit_io);
     /* register the speaker port */
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/i8259.c
--- a/xen/arch/x86/hvm/i8259.c  Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/i8259.c  Wed Jul 19 16:09:59 2006 +0800
@@ -454,12 +454,91 @@ static uint32_t elcr_ioport_read(void *o
     return s->elcr;
 }
 
+void pic_info(PicState *s)
+{
+    printk("*****pic state:*****\n");
+    printk("pic 0x%x.\n", s->last_irr);
+    printk("pic 0x%x.\n", s->irr);
+    printk("pic 0x%x.\n", s->imr);
+    printk("pic 0x%x.\n", s->isr);
+    printk("pic 0x%x.\n", s->priority_add);
+    printk("pic 0x%x.\n", s->irq_base);
+    printk("pic 0x%x.\n", s->read_reg_select);
+    printk("pic 0x%x.\n", s->poll);
+    printk("pic 0x%x.\n", s->special_mask);
+    printk("pic 0x%x.\n", s->init_state);
+    printk("pic 0x%x.\n", s->auto_eoi);
+    printk("pic 0x%x.\n", s->rotate_on_auto_eoi);
+    printk("pic 0x%x.\n", s->special_fully_nested_mode);
+    printk("pic 0x%x.\n", s->init4);
+    printk("pic 0x%x.\n", s->elcr);
+    printk("pic 0x%x.\n", s->elcr_mask);
+}
+
+static void pic_save(hvm_domain_context_t *h, void *opaque)
+{
+    PicState *s = opaque;
+    
+#ifdef HVM_DEBUG_SUSPEND
+    pic_info(s);
+#endif
+
+    hvm_put_8u(h, s->last_irr);
+    hvm_put_8u(h, s->irr);
+    hvm_put_8u(h, s->imr);
+    hvm_put_8u(h, s->isr);
+    hvm_put_8u(h, s->priority_add);
+    hvm_put_8u(h, s->irq_base);
+    hvm_put_8u(h, s->read_reg_select);
+    hvm_put_8u(h, s->poll);
+    hvm_put_8u(h, s->special_mask);
+    hvm_put_8u(h, s->init_state);
+    hvm_put_8u(h, s->auto_eoi);
+    hvm_put_8u(h, s->rotate_on_auto_eoi);
+    hvm_put_8u(h, s->special_fully_nested_mode);
+    hvm_put_8u(h, s->init4);
+    hvm_put_8u(h, s->elcr);
+    hvm_put_8u(h, s->elcr_mask);
+}
+
+static int pic_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    PicState *s = opaque;
+    
+    if (version_id != 1)
+        return -EINVAL;
+
+    s->last_irr = hvm_get_8u(h);
+    s->irr = hvm_get_8u(h);
+    s->imr = hvm_get_8u(h);
+    s->isr = hvm_get_8u(h);
+    s->priority_add = hvm_get_8u(h);
+    s->irq_base = hvm_get_8u(h);
+    s->read_reg_select= hvm_get_8u(h);
+    s->poll = hvm_get_8u(h);
+    s->special_mask = hvm_get_8u(h);
+    s->init_state = hvm_get_8u(h);
+    s->auto_eoi = hvm_get_8u(h);
+    s->rotate_on_auto_eoi = hvm_get_8u(h);
+    s->special_fully_nested_mode = hvm_get_8u(h);
+    s->init4 = hvm_get_8u(h);
+    s->elcr = hvm_get_8u(h);
+    s->elcr_mask = hvm_get_8u(h);
+
+#ifdef HVM_DEBUG_SUSPEND
+    pic_info(s);
+#endif
+
+    return 0;
+}
+
 /* XXX: add generic master/slave system */
 /* Caller must hold vpic lock */
 static void pic_init1(int io_addr, int elcr_addr, PicState *s)
 {
     BUG_ON(!spin_is_locked(&s->pics_state->lock));
 
+    hvm_register_savevm("xen_hvm_i8259", io_addr, 1, pic_save, pic_load, s);
     pic_reset(s);
 }
 
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/intercept.c
--- a/xen/arch/x86/hvm/intercept.c      Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/intercept.c      Wed Jul 19 16:09:59 2006 +0800
@@ -29,6 +29,8 @@
 #include <asm/current.h>
 #include <io_ports.h>
 #include <xen/event.h>
+#include <xen/compile.h>
+#include <public/version.h>
 
 
 extern struct hvm_mmio_handler vlapic_mmio_handler;
@@ -303,6 +305,266 @@ void destroy_periodic_time(struct period
     }
 }
 
+/* save/restore support */
+#define HVM_FILE_MAGIC   0x54381286
+#define HVM_FILE_VERSION 0x00000001
+
+int hvm_register_savevm(const char *idstr,
+                    int instance_id,
+                    int version_id,
+                    SaveStateHandler *save_state,
+                    LoadStateHandler *load_state,
+                    void *opaque)
+{
+    HVMStateEntry *se, **pse;
+    struct vcpu *v = current;
+
+    if (!hvm_guest(v)) {
+        printk("register savevm only for hvm guest!\n");
+        return -1;
+    }
+
+    if ( (se = xmalloc(struct HVMStateEntry)) == NULL ){
+        printk("allocat hvmstate entry fail.\n");
+        return -1;
+    }
+
+    strncpy(se->idstr, idstr, HVM_SE_IDSTR_LEN);
+
+    se->instance_id = instance_id;
+    se->version_id = version_id;
+    se->save_state = save_state;
+    se->load_state = load_state;
+    se->opaque = opaque;
+    se->next = NULL;
+
+    /* add at the end of list */
+    pse = &v->domain->arch.hvm_domain.first_se;
+    while (*pse != NULL)
+        pse = &(*pse)->next;
+    *pse = se;
+    return 0;
+}
+
+int hvm_save(struct vcpu *v, hvm_domain_context_t *h)
+{
+    uint32_t len, len_pos, cur_pos;
+    uint32_t eax, ebx, ecx, edx;
+    HVMStateEntry *se;
+    char *chgset;
+
+    if (!hvm_guest(v)) {
+        printk("hvm_save only for hvm guest!\n");
+        return -1;
+    }
+
+    memset(h, 0, sizeof(hvm_domain_context_t));
+    hvm_put_32u(h, HVM_FILE_MAGIC);
+    hvm_put_32u(h, HVM_FILE_VERSION);
+
+    /* save xen changeset */
+    chgset = strrchr(XEN_CHANGESET, ' ') + 1;
+
+    len = strlen(chgset);
+    hvm_put_8u(h, len);
+    hvm_put_buffer(h, chgset, len);
+
+    /* save cpuid */
+    cpuid(1, &eax, &ebx, &ecx, &edx);
+    hvm_put_32u(h, eax);
+
+    for(se = v->domain->arch.hvm_domain.first_se; se != NULL; se = se->next) {
+        /* ID string */
+        len = strnlen(se->idstr, HVM_SE_IDSTR_LEN);
+        hvm_put_8u(h, len);
+        hvm_put_buffer(h, se->idstr, len);
+
+        hvm_put_32u(h, se->instance_id);
+        hvm_put_32u(h, se->version_id);
+
+        /* record size */
+        len_pos = hvm_ctxt_tell(h);
+        hvm_put_32u(h, 0);
+
+        se->save_state(h, se->opaque);
+
+        cur_pos = hvm_ctxt_tell(h);
+        len = cur_pos - len_pos - 4;
+        hvm_ctxt_seek(h, len_pos);
+        hvm_put_32u(h, len);
+        hvm_ctxt_seek(h, cur_pos);
+
+    }
+
+    h->size = hvm_ctxt_tell(h);
+    hvm_ctxt_seek(h, 0);
+
+    return 0;
+
+}
+
+static HVMStateEntry *find_se(struct domain *d, const char *idstr, int instance_id)
+{
+    HVMStateEntry *se;
+
+    for(se = d->arch.hvm_domain.first_se; se != NULL; se = se->next) {
+        if (!strncmp(se->idstr, idstr, HVM_SE_IDSTR_LEN) &&
+            instance_id == se->instance_id){
+            return se;
+        }
+    }
+    return NULL;
+}
+
+int hvm_load(struct vcpu *v, hvm_domain_context_t *h)
+{
+    uint32_t len, rec_len, rec_pos, magic, instance_id, version_id;
+    uint32_t eax, ebx, ecx, edx;
+    HVMStateEntry *se;
+    char idstr[HVM_SE_IDSTR_LEN];
+    xen_changeset_info_t chgset;
+    char *cur_chgset;
+
+    if (!hvm_guest(v)) {
+        printk("hvm_load only for hvm guest!\n");
+        return -1;
+    }
+
+    hvm_ctxt_seek(h, 0);
+
+    magic = hvm_get_32u(h);
+    if (magic != HVM_FILE_MAGIC) {
+        printk("HVM restore magic dismatch!\n");
+        return -1;
+    }
+
+    magic = hvm_get_32u(h);
+    if (magic != HVM_FILE_VERSION) {
+        printk("HVM restore version dismatch!\n");
+        return -1;
+    }
+
+    /* check xen change set */
+    cur_chgset = strrchr(XEN_CHANGESET, ' ') + 1;
+
+    len = hvm_get_8u(h);
+    hvm_get_buffer(h, chgset, len);
+    chgset[len] = '\0';
+    if (strncmp(cur_chgset, chgset, len + 1))
+        printk("warnings: try to restore hvm guest(%s) on a different 
changeset %s.\n",
+                chgset, cur_chgset);
+
+    /* check cpuid */
+    cpuid(1, &eax, &ebx, &ecx, &edx);
+    ebx = hvm_get_32u(h);
+    /*TODO: need difine how big difference is acceptable */
+    if (ebx != eax)
+        printk("warnings: try to restore hvm guest(0x%"PRIx32") "
+               "on a different type processor(0x%"PRIx32").\n",
+                ebx,
+                eax);
+
+    while(1) {
+        if (hvm_ctxt_end(h)) {
+            break;
+        }
+
+        /* ID string */
+        len = hvm_get_8u(h);
+        if (len > HVM_SE_IDSTR_LEN)
+            printk("HVM save entry idstr len wrong!");
+
+        hvm_get_buffer(h, idstr, len);
+        idstr[len] = '\0';
+
+        instance_id = hvm_get_32u(h);
+        version_id = hvm_get_32u(h);
+
+        rec_len = hvm_get_32u(h);
+        rec_pos = hvm_ctxt_tell(h);
+
+        se = find_se(v->domain, idstr, instance_id);
+        if (se)
+            se->load_state(h, se->opaque, version_id);
+        else
+            printk("warnings: hvm load can't find device %s's instance %d!\n",
+                    idstr, version_id);
+                    
+
+        /* make sure to jump end of record */
+        if ( hvm_ctxt_tell(h) - rec_pos != rec_len) {
+            printk("wrong hvm record size, maybe some dismatch between 
save&restoreo handler!\n");
+        }
+        hvm_ctxt_seek(h, rec_pos + rec_len);
+    }
+
+    return 0;
+}
+
+void shpage_info(shared_iopage_t *sh)
+{
+
+    vcpu_iodata_t *p = &sh->vcpu_iodata[0];
+    ioreq_t *req = &p->vp_ioreq;
+    printk("*****sharepage_info******!\n");
+    printk("vp_eport=%d,dm_eport=%d\n", p->vp_eport, p->dm_eport);
+    printk("io packet: "
+                     "state:%x, pvalid: %x, dir:%x, port: %"PRIx64", "
+                     "data: %"PRIx64", count: %"PRIx64", size: %"PRIx64"\n",
+                     req->state, req->pdata_valid, req->dir, req->addr,
+                     req->u.data, req->count, req->size);
+    printk("pic_elcr=0x%x, pic_irr=0x%x, pic_last_irr=0x%x, 
pic_clear_irr=0x%x.\n",
+            sh->sp_global.pic_elcr,
+            sh->sp_global.pic_irr,
+            sh->sp_global.pic_last_irr,
+            sh->sp_global.pic_clear_irr);
+}
+
+static void shpage_save(hvm_domain_context_t *h, void *opaque)
+{
+    struct shared_iopage *s = opaque;
+    /* XXX:smp */
+    struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq;
+
+#ifdef HVM_DEBUG_SUSPEND
+    shpage_info(s);
+#endif
+
+    hvm_put_16u(h, s->sp_global.pic_elcr);
+    hvm_put_16u(h, s->sp_global.pic_irr);
+    hvm_put_16u(h, s->sp_global.pic_last_irr);
+    hvm_put_16u(h, s->sp_global.pic_clear_irr);
+
+    hvm_put_buffer(h, (char*)req, sizeof(struct ioreq));
+}
+
+static int shpage_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    struct shared_iopage *s = opaque;
+    /* XXX:smp */
+    struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq;
+    if (version_id != 1)
+        return -EINVAL;
+
+    s->sp_global.pic_elcr = hvm_get_16u(h);
+    s->sp_global.pic_irr = hvm_get_16u(h);
+    s->sp_global.pic_last_irr = hvm_get_16u(h);
+    s->sp_global.pic_clear_irr = hvm_get_16u(h);
+
+    hvm_get_buffer(h, (char*)req, sizeof(struct ioreq));
+
+#ifdef HVM_DEBUG_SUSPEND
+    shpage_info(s);
+#endif
+
+    return 0;
+}
+
+void shpage_init(shared_iopage_t *sp)
+{
+    hvm_register_savevm("xen_hvm_shpage", 0x10, 1, shpage_save, shpage_load, 
sp);
+}
+
 /*
  * Local variables:
  * mode: C
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/svm/svm.c        Wed Jul 19 16:09:59 2006 +0800
@@ -763,6 +763,7 @@ static void svm_relinquish_guest_resourc
 {
     extern void destroy_vmcb(struct arch_svm_struct *); /* XXX */
     struct vcpu *v;
+    HVMStateEntry *se, *dse;
 
     for_each_vcpu ( d, v )
     {
@@ -780,6 +781,13 @@ static void svm_relinquish_guest_resourc
     }
 
     kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
+
+    se = d->arch.hvm_domain.first_se;
+    while (se) {
+        dse = se;
+        se = se->next;
+        xfree(dse);
+    }
 
     if ( d->arch.hvm_domain.shared_page_va )
         unmap_domain_page_global(
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c       Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/vmx/vmcs.c       Wed Jul 19 16:09:59 2006 +0800
@@ -572,6 +572,7 @@ void arch_vmx_do_launch(struct vcpu *v)
     }
 
     vmx_do_launch(v);
+    hvm_load_cpu_context(v, &v->arch.guest_context.hvmcpu_ctxt);
     reset_stack_and_jump(vmx_asm_do_vmentry);
 }
 
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Jul 19 16:09:59 2006 +0800
@@ -126,6 +126,7 @@ static void vmx_relinquish_guest_resourc
 static void vmx_relinquish_guest_resources(struct domain *d)
 {
     struct vcpu *v;
+    HVMStateEntry *se, *dse;
 
     for_each_vcpu ( d, v )
     {
@@ -142,6 +143,13 @@ static void vmx_relinquish_guest_resourc
     }
 
     kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
+
+    se = d->arch.hvm_domain.first_se;
+    while (se) {
+        dse = se;
+        se = se->next;
+        xfree(dse);
+    }
 
     if ( d->arch.hvm_domain.shared_page_va )
         unmap_domain_page_global(
@@ -521,6 +529,337 @@ static void vmx_store_cpu_guest_regs(
     }
 
     vmx_vmcs_exit(v);
+}
+
+int vmx_vmcs_save(struct vcpu *v, struct vmcs_data *c)
+{
+    unsigned long inst_len;
+    int error = 0;
+
+    error |= __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len);
+    error |= __vmread(GUEST_RIP, &c->eip);
+
+#ifdef HVM_DEBUG_SUSPEND
+    printk("vmx_vmcs_save: inst_len=0x%lx, eip=0x%"PRIx64".\n", 
+            inst_len, c->eip);
+#endif
+
+    error |= __vmread(GUEST_RSP, &c->esp);
+    error |= __vmread(GUEST_RFLAGS, &c->eflags);
+
+    error |= __vmread(CR0_READ_SHADOW, &c->cr0);
+
+    c->cr3 = v->arch.hvm_vmx.cpu_cr3;
+#ifdef HVM_DEBUG_SUSPEND
+    printk("vmx_vmcs_save: cr3=0x%"PRIx64".\n", c->cr3);
+#endif
+
+    error |= __vmread(CR4_READ_SHADOW, &c->cr4);
+
+    error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
+    error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
+
+    error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
+    error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
+
+    error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
+    error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
+    error |= __vmread(GUEST_CS_BASE, &c->cs_base);
+    error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes);
+
+    error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
+    error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
+    error |= __vmread(GUEST_DS_BASE, &c->ds_base);
+    error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes);
+
+    error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
+    error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
+    error |= __vmread(GUEST_ES_BASE, &c->es_base);
+    error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes);
+
+    error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
+    error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
+    error |= __vmread(GUEST_SS_BASE, &c->ss_base);
+    error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes);
+
+    error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
+    error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
+    error |= __vmread(GUEST_FS_BASE, &c->fs_base);
+    error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes);
+
+    error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
+    error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
+    error |= __vmread(GUEST_GS_BASE, &c->gs_base);
+    error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes);
+
+    error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
+    error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
+    error |= __vmread(GUEST_TR_BASE, &c->tr_base);
+    error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes);
+
+    error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
+    error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
+    error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
+    error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes);
+
+    error |= __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs);
+    error |= __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp);
+    error |= __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip);
+
+    return !error;
+}
+
+int vmx_vmcs_restore(struct vcpu *v, struct vmcs_data *c)
+{
+    unsigned long mfn, old_cr4, old_base_mfn;
+    int error = 0;
+
+    error |= __vmwrite(GUEST_RIP, c->eip);
+    error |= __vmwrite(GUEST_RSP, c->esp);
+    error |= __vmwrite(GUEST_RFLAGS, c->eflags);
+
+    error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
+
+    if (!vmx_paging_enabled(v)) {
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
+        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+        goto skip_cr3;
+    }
+
+    if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
+        /*
+         * This is simple TLB flush, implying the guest has
+         * removed some translation or changed page attributes.
+         * We simply invalidate the shadow.
+         */
+        mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
+        if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
+            printk("Invalid CR3 value=%"PRIx64"", c->cr3);
+            domain_crash_synchronous();
+            return 0;
+        }
+        shadow_sync_all(v->domain);
+    } else {
+        /*
+         * If different, make a shadow. Check if the PDBR is valid
+         * first.
+         */
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %x", c->cr3);
+        if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
+            printk("Invalid CR3 value=%"PRIx64"", c->cr3);
+            domain_crash_synchronous();
+            return 0;
+        }
+        mfn = get_mfn_from_gpfn(c->cr3 >> PAGE_SHIFT);
+        if(!get_page(mfn_to_page(mfn), v->domain))
+                return 0;
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+        v->arch.guest_table = pagetable_from_pfn(mfn);
+        if (old_base_mfn)
+             put_page(mfn_to_page(old_base_mfn));
+        /*
+         * arch.shadow_table should now hold the next CR3 for shadow
+         */
+        v->arch.hvm_vmx.cpu_cr3 = c->cr3;
+        update_pagetables(v);
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
+        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+    }
+
+ skip_cr3:
+
+    error |= __vmread(CR4_READ_SHADOW, &old_cr4);
+    error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
+    error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
+
+    error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
+    error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
+
+    error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
+    error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
+
+    error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
+    error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
+    error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
+    error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
+
+    error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
+    error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
+    error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
+    error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
+
+    error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
+    error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
+    error |= __vmwrite(GUEST_ES_BASE, c->es_base);
+    error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
+
+    error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
+    error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
+    error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
+    error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
+
+    error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
+    error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
+    error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
+    error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
+
+    error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
+    error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
+    error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
+    error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
+
+    error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
+    error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
+    error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
+    error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
+
+    error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
+    error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
+    error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
+    error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
+
+    error |= __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
+    error |= __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
+    error |= __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
+
+    return !error;
+}
+
+void dump_msr_state(struct vmx_msr_state *m)
+{
+    int i = 0;
+    printk("**** msr state ****\n");
+    printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        printk("0x%lx,", m->msr_items[i]);
+    printk("\n");
+}
+        
+void vmx_save_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    struct vmcs_data *data = &ctxt->data;
+    struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_content;
+    unsigned long guest_flags = guest_state->flags;
+    int i = 0;
+
+    data->shadow_gs = guest_state->shadow_gs;
+    data->cpu_state = v->arch.hvm_vmx.cpu_state;
+    /* save msrs */
+    data->flags = guest_flags;
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        data->msr_items[i] = guest_state->msr_items[i];
+
+#ifdef HVM_DEBUG_SUSPEND
+    dump_msr_state(guest_state);
+    printk("saved cpu_state=0x%"PRIX64"\n", data->cpu_state);
+#endif
+}
+
+void vmx_load_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    int i = 0;
+    struct vmcs_data *data = &ctxt->data;
+    struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_content;
+
+    /* restore msrs */
+    guest_state->flags = data->flags;
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        guest_state->msr_items[i] = data->msr_items[i];
+
+    guest_state->shadow_gs = data->shadow_gs;
+
+    vmx_restore_msrs(v);
+
+    v->arch.hvm_vmx.cpu_state = data->cpu_state;
+
+#ifdef HVM_DEBUG_SUSPEND
+    dump_msr_state(guest_state);
+    printk("restore cpu_state=0x%lx.\n", v->arch.hvm_vmx.cpu_state);
+
+#endif
+
+#if defined(__x86_64__)
+        if ( test_bit(VMX_CPU_STATE_LME_ENABLED,
+                     &v->arch.hvm_vmx.cpu_state) )
+        {
+            unsigned long vm_entry_value;
+            if ( test_bit(VMX_CPU_STATE_LMA_ENABLED,
+                        &v->arch.hvm_vmx.cpu_state) ) {
+                __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
+                vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
+                __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
+
+                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
+                {
+                    printk("Unsupported guest paging levels\n");
+                    domain_crash_synchronous(); /* need to take a clean path */
+                }
+            }
+        }
+        else
+#endif  /* __x86_64__ */
+        {
+#if CONFIG_PAGING_LEVELS >= 3
+            /* seems it's a 32-bit or 32-bit PAE guest */
+            if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
+                        &v->arch.hvm_vmx.cpu_state) )
+            {
+                /* The guest enables PAE first and then it enables PG, it is
+                 * really a PAE guest */
+                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
+                {
+                    printk("Unsupported guest paging levels\n");
+                    domain_crash_synchronous();
+                }
+            }
+            else
+            {
+                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
+                {
+                    printk("Unsupported guest paging levels\n");
+                    domain_crash_synchronous(); /* need to take a clean path */
+                }
+            }
+#endif
+        }
+
+}
+
+void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    struct vmcs_data *data = &ctxt->data;
+
+    /* set the valid flag so the whole vmcs is recovered on restore */
+    ctxt->valid = 1;
+
+    vmx_save_cpu_state(v, ctxt);
+
+    vmx_vmcs_enter(v);
+
+    if (!vmx_vmcs_save(v, data))
+        printk("vmx_vmcs save failed!\n");
+
+    vmx_vmcs_exit(v);
+
+}
+
+void vmx_load_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    if (!ctxt->valid)
+        return;
+
+    vmx_load_cpu_state(v, ctxt);
+
+    vmx_vmcs_enter(v);
+
+    if (!vmx_vmcs_restore(v, &ctxt->data))
+        printk("vmx_vmcs restore failed!\n");
+
+    /* only load vmcs once */
+    ctxt->valid = 0;
+
+    vmx_vmcs_exit(v);
+
 }
 
 /*
@@ -741,6 +1080,9 @@ int start_vmx(void)
 
     hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
     hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
+
+    hvm_funcs.save_cpu_ctxt = vmx_save_vmcs_ctxt;
+    hvm_funcs.load_cpu_ctxt = vmx_load_vmcs_ctxt;
 
     hvm_funcs.realmode = vmx_realmode;
     hvm_funcs.paging_enabled = vmx_paging_enabled;
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/common/dom0_ops.c
--- a/xen/common/dom0_ops.c     Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/common/dom0_ops.c     Wed Jul 19 16:09:59 2006 +0800
@@ -27,6 +27,8 @@ extern long arch_do_dom0_op(
     struct dom0_op *op, XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op);
 extern void arch_getdomaininfo_ctxt(
     struct vcpu *, struct vcpu_guest_context *);
+extern void arch_gethvm_ctxt(
+    struct vcpu *, struct hvm_domain_context *);
 
 static inline int is_free_domid(domid_t dom)
 {
@@ -504,6 +506,77 @@ long do_dom0_op(XEN_GUEST_HANDLE(dom0_op
     }
     break;
 
+    case DOM0_GETHVMCONTEXT:
+    { 
+        struct hvm_domain_context *c;
+        struct domain             *d;
+        struct vcpu               *v;
+
+        ret = -ESRCH;
+        if ( (d = find_domain_by_id(op->u.gethvmcontext.domain)) == NULL )
+            break;
+
+        ret = -ENOMEM;
+        if ( (c = xmalloc(struct hvm_domain_context)) == NULL )
+            goto gethvmcontext_out;
+
+        v = d->vcpu[0];
+
+        ret = -ENODATA;
+        if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+            goto gethvmcontext_out;
+        
+        arch_gethvm_ctxt(v, c);
+
+        ret = 0;
+        if ( copy_to_guest(op->u.gethvmcontext.hvm_ctxt, c, 1) )
+            ret = -EFAULT;
+
+        xfree(c);
+
+        if ( copy_to_guest(u_dom0_op, op, 1) )
+            ret = -EFAULT;
+
+    gethvmcontext_out:
+        put_domain(d);
+    }
+    break;
+
+    case DOM0_SETHVMCONTEXT:
+    { 
+        struct hvm_domain_context *c;
+        struct domain             *d;
+        struct vcpu               *v;
+
+        ret = -ESRCH;
+        if ( (d = find_domain_by_id(op->u.sethvmcontext.domain)) == NULL )
+            break;
+
+        ret = -ENOMEM;
+        if ( (c = xmalloc(struct hvm_domain_context)) == NULL )
+            goto sethvmcontext_out;
+
+        /*XXX: need check input vcpu when smp */
+        v = d->vcpu[0];
+        
+        ret = -EFAULT;
+        if ( copy_from_guest(c, op->u.sethvmcontext.hvm_ctxt, 1) != 0 )
+            goto sethvmcontext_out;
+
+        /* store the data for future use */
+        d->arch.hvm_domain.hvm_ctxt = c;
+
+        ret = 0;
+
+        if ( copy_to_guest(u_dom0_op, op, 1) )
+            ret = -EFAULT;
+
+    sethvmcontext_out:
+        put_domain(d);
+    }
+    break;
+
+
     case DOM0_GETVCPUINFO:
     { 
         struct domain *d;
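
For context, the two new dom0 ops above are driven from libxc: xc_hvm_save.c and xc_hvm_restore.c (further down in this patch) call xc_domain_hvm_getcontext()/xc_domain_hvm_setcontext() wrappers whose bodies are not shown in this excerpt. A minimal sketch of the get side, assuming the usual xc_private.h conventions (DECLARE_DOM0_OP, set_xen_guest_handle, xc_dom0_op) and an mlock'ed buffer, would be roughly:

/* sketch only -- the real wrapper in the patch may differ in detail */
int xc_domain_hvm_getcontext(int xc_handle, uint32_t domid,
                             hvm_domain_context_t *hvm_ctxt)
{
    int rc;
    DECLARE_DOM0_OP;

    op.cmd = DOM0_GETHVMCONTEXT;
    op.u.gethvmcontext.domain = (domid_t)domid;
    set_xen_guest_handle(op.u.gethvmcontext.hvm_ctxt, hvm_ctxt);

    /* the hypervisor copy_to_guest()s into this buffer, so pin it first */
    if ( mlock(hvm_ctxt, sizeof(*hvm_ctxt)) != 0 )
        return -1;

    rc = xc_dom0_op(xc_handle, &op);

    (void)munlock(hvm_ctxt, sizeof(*hvm_ctxt));
    return rc;
}

The set side is symmetric, using DOM0_SETHVMCONTEXT and op.u.sethvmcontext.
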
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h  Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/asm-x86/hvm/domain.h  Wed Jul 19 16:09:59 2006 +0800
@@ -30,6 +30,20 @@
 
 #define HVM_PBUF_SIZE   80
 
+typedef void SaveStateHandler(hvm_domain_context_t *h, void *opaque);
+typedef int LoadStateHandler(hvm_domain_context_t *h, void *opaque, int version_id);
+
+#define HVM_SE_IDSTR_LEN 32
+typedef struct HVMStateEntry {
+    char idstr[HVM_SE_IDSTR_LEN];
+    int instance_id;
+    int version_id;
+    SaveStateHandler *save_state;
+    LoadStateHandler *load_state;
+    void *opaque;
+    struct HVMStateEntry *next;
+} HVMStateEntry;
+
 struct hvm_domain {
     unsigned long          shared_page_va;
     unsigned int           nr_vcpus;
@@ -48,6 +62,8 @@ struct hvm_domain {
 
     int                    pbuf_index;
     char                   pbuf[HVM_PBUF_SIZE];
+    struct hvm_domain_context *hvm_ctxt;
+    HVMStateEntry *first_se;
 };
 
 #endif /* __ASM_X86_HVM_DOMAIN_H__ */
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/asm-x86/hvm/hvm.h     Wed Jul 19 16:09:59 2006 +0800
@@ -47,6 +47,13 @@ struct hvm_function_table {
         struct vcpu *v, struct cpu_user_regs *r, unsigned long *crs);
     void (*load_cpu_guest_regs)(
         struct vcpu *v, struct cpu_user_regs *r);
+
+    /* save and load hvm guest cpu context for save/restore */
+    void (*save_cpu_ctxt)(
+        struct vcpu *v, struct hvmcpu_context *ctxt);
+    void (*load_cpu_ctxt)(
+        struct vcpu *v, struct hvmcpu_context *ctxt);
+
     /*
      * Examine specifics of the guest state:
      * 1) determine whether the guest is in real or vm8086 mode,
@@ -103,6 +110,20 @@ hvm_load_cpu_guest_regs(struct vcpu *v, 
     hvm_funcs.load_cpu_guest_regs(v, r);
 }
 
+static inline void
+hvm_save_cpu_context(
+        struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    hvm_funcs.save_cpu_ctxt(v, ctxt);
+}
+
+static inline void
+hvm_load_cpu_context(
+        struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    hvm_funcs.load_cpu_ctxt(v, ctxt);
+}
+
 static inline int
 hvm_realmode(struct vcpu *v)
 {
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/asm-x86/hvm/support.h
--- a/xen/include/asm-x86/hvm/support.h Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/asm-x86/hvm/support.h Wed Jul 19 16:09:59 2006 +0800
@@ -25,6 +25,7 @@
 #include <asm/types.h>
 #include <asm/regs.h>
 #include <asm/processor.h>
+#include <public/dom0_ops.h>
 
 #ifndef NDEBUG
 #define HVM_DEBUG 1
@@ -136,6 +137,129 @@ extern unsigned int opt_hvm_debug_level;
         domain_crash_synchronous();                             \
     } while (0)
 
+/* save/restore support */
+
+//#define HVM_DEBUG_SUSPEND
+
+extern int hvm_register_savevm(const char *idstr,
+                    int instance_id,
+                    int version_id,
+                    SaveStateHandler *save_state,
+                    LoadStateHandler *load_state,
+                    void *opaque);
+
+static inline void hvm_ctxt_seek(hvm_domain_context_t *h, unsigned int pos)
+{
+    h->cur = pos;
+}
+
+static inline uint32_t hvm_ctxt_tell(hvm_domain_context_t *h)
+{
+    return h->cur;
+}
+
+static inline int hvm_ctxt_end(hvm_domain_context_t *h)
+{
+    return (h->cur >= h->size || h->cur >= HVM_CTXT_SIZE);
+}
+
+static inline void hvm_put_byte(hvm_domain_context_t *h, unsigned int i)
+{
+    if (h->cur >= HVM_CTXT_SIZE) {
+        printk("hvm_put_byte overflow.\n");
+        return;
+    }
+    h->data[h->cur++] = (char)i;
+}
+
+static inline void hvm_put_8u(hvm_domain_context_t *h, uint8_t b)
+{
+    hvm_put_byte(h, b);
+}
+
+static inline void hvm_put_16u(hvm_domain_context_t *h, uint16_t b)
+{
+    hvm_put_8u(h, b >> 8);
+    hvm_put_8u(h, b);
+}
+
+static inline void hvm_put_32u(hvm_domain_context_t *h, uint32_t b)
+{
+    hvm_put_16u(h, b >> 16);
+    hvm_put_16u(h, b);
+}
+
+static inline void hvm_put_64u(hvm_domain_context_t *h, uint64_t b)
+{
+    hvm_put_32u(h, b >> 32);
+    hvm_put_32u(h, b);
+}
+
+static inline void hvm_put_buffer(hvm_domain_context_t *h, const char *buf, int len)
+{
+    memcpy(&h->data[h->cur], buf, len);
+    h->cur += len;
+}
+
+
+static inline char hvm_get_byte(hvm_domain_context_t *h)
+{
+    if (h->cur >= HVM_CTXT_SIZE) {
+        printk("hvm_get_byte overflow.\n");
+        return -1;
+    }
+
+    if (h->cur >= h->size) {
+        printk("hvm_get_byte exceed data area.\n");
+        return -1;
+    }
+
+    return h->data[h->cur++];
+}
+
+static inline uint8_t hvm_get_8u(hvm_domain_context_t *h)
+{
+    return hvm_get_byte(h);
+}
+
+static inline uint16_t hvm_get_16u(hvm_domain_context_t *h)
+{
+    uint16_t v;
+    v =  hvm_get_8u(h) << 8;
+    v |= hvm_get_8u(h);
+
+    return v;
+}
+
+static inline uint32_t hvm_get_32u(hvm_domain_context_t *h)
+{
+    uint32_t v;
+    v =  hvm_get_16u(h) << 16;
+    v |= hvm_get_16u(h);
+
+    return v;
+}
+
+static inline uint64_t hvm_get_64u(hvm_domain_context_t *h)
+{
+    uint64_t v;
+    v =  (uint64_t)hvm_get_32u(h) << 32;
+    v |= hvm_get_32u(h);
+
+    return v;
+}
+
+static inline void hvm_get_buffer(hvm_domain_context_t *h, char *buf, int len)
+{
+    memcpy(buf, &h->data[h->cur], len);
+    h->cur += len;
+}
+
+extern int hvm_save(struct vcpu*, hvm_domain_context_t *h);
+extern int hvm_load(struct vcpu*, hvm_domain_context_t *h);
+
+extern void shpage_init(shared_iopage_t *sp);
+
 extern int hvm_enabled;
 
 enum { HVM_COPY_IN = 0, HVM_COPY_OUT };
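
The helpers above give device models a simple marshalling interface: multi-byte values go out most-significant byte first via hvm_put_16u/32u/64u and must be read back in the same order via hvm_get_*, and each emulated device registers one save/load pair with hvm_register_savevm(). As a rough illustration only (a made-up "timer" device, not one of the handlers this patch actually registers), a pair could look like:

/* hypothetical device state, for illustration only */
struct my_timer {
    uint32_t mode;
    uint64_t count;
};
static struct my_timer the_timer;

static void timer_save(hvm_domain_context_t *h, void *opaque)
{
    struct my_timer *t = opaque;

    hvm_put_32u(h, t->mode);     /* written MSB first */
    hvm_put_64u(h, t->count);
}

static int timer_load(hvm_domain_context_t *h, void *opaque, int version_id)
{
    struct my_timer *t = opaque;

    if ( version_id != 1 )
        return -EINVAL;          /* refuse unknown layouts */

    t->mode  = hvm_get_32u(h);   /* read back in the same order */
    t->count = hvm_get_64u(h);
    return 0;
}

void timer_register_savevm(void)
{
    hvm_register_savevm("timer", 0 /* instance */, 1 /* version */,
                        timer_save, timer_load, &the_timer);
}
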
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/public/arch-x86_32.h
--- a/xen/include/public/arch-x86_32.h  Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/public/arch-x86_32.h  Wed Jul 19 16:09:59 2006 +0800
@@ -142,6 +142,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t)
 DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
 
 typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+#include "vmcs_data.h"
+
+struct hvmcpu_context {
+    uint32_t valid;
+    struct vmcs_data data;
+};
 
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled 
@@ -174,6 +181,7 @@ struct vcpu_guest_context {
     unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
     unsigned long failsafe_callback_eip;
     unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
+    struct hvmcpu_context hvmcpu_ctxt;          /* whole vmcs region */
 };
 typedef struct vcpu_guest_context vcpu_guest_context_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/public/arch-x86_64.h
--- a/xen/include/public/arch-x86_64.h  Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/public/arch-x86_64.h  Wed Jul 19 16:09:59 2006 +0800
@@ -212,6 +212,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t)
 #undef __DECL_REG
 
 typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+#include "vmcs_data.h"
+
+struct hvmcpu_context {
+    uint32_t valid;
+    struct vmcs_data data;
+};
 
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled 
@@ -249,6 +256,7 @@ struct vcpu_guest_context {
     uint64_t      fs_base;
     uint64_t      gs_base_kernel;
     uint64_t      gs_base_user;
+    struct hvmcpu_context hvmcpu_ctxt;          /* whole vmcs region */
 };
 typedef struct vcpu_guest_context vcpu_guest_context_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h     Wed Jul 19 13:45:04 2006 +0800
+++ b/xen/include/public/dom0_ops.h     Wed Jul 19 16:09:59 2006 +0800
@@ -535,6 +535,31 @@ struct dom0_settimeoffset {
 };
 typedef struct dom0_settimeoffset dom0_settimeoffset_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_settimeoffset_t);
+
+#define HVM_CTXT_SIZE        4096
+typedef struct hvm_domain_context {
+    uint32_t cur;
+    uint32_t size;
+    uint8_t data[HVM_CTXT_SIZE];
+} hvm_domain_context_t;
+DEFINE_XEN_GUEST_HANDLE(hvm_domain_context_t);
+#define DOM0_GETHVMCONTEXT   51
+typedef struct dom0_gethvmcontext {
+    /* IN variables. */
+    domid_t  domain;                  /* domain to be affected */
+    /* OUT variables. */
+    XEN_GUEST_HANDLE(hvm_domain_context_t) hvm_ctxt;
+} dom0_gethvmcontext_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_gethvmcontext_t);
+
+#define DOM0_SETHVMCONTEXT   52
+typedef struct dom0_sethvmcontext {
+    /* IN variables. */
+    domid_t  domain;                  /* domain to be affected */
+    /* OUT variables. */
+    XEN_GUEST_HANDLE(hvm_domain_context_t) hvm_ctxt;
+} dom0_sethvmcontext_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_sethvmcontext_t);
 
 struct dom0_op {
     uint32_t cmd;
@@ -579,6 +604,8 @@ struct dom0_op {
         struct dom0_hypercall_init    hypercall_init;
         struct dom0_domain_setup      domain_setup;
         struct dom0_settimeoffset     settimeoffset;
+        struct dom0_gethvmcontext     gethvmcontext;
+        struct dom0_sethvmcontext     sethvmcontext;
         uint8_t                       pad[128];
     } u;
 };
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xc_hvm_restore.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_hvm_restore.c      Wed Jul 19 16:09:59 2006 +0800
@@ -0,0 +1,264 @@
+/******************************************************************************
+ * xc_hvm_restore.c
+ *
+ * Restore the state of an HVM guest.
+ *
+ * Copyright (c) 2006 Intel Corporation
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/ioreq.h>
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+/* A table mapping each PFN to its new MFN. */
+static xen_pfn_t *p2m = NULL;
+
+static ssize_t
+read_exact(int fd, void *buf, size_t count)
+{
+    int r = 0, s;
+    unsigned char *b = buf;
+
+    while (r < count) {
+        s = read(fd, &b[r], count - r);
+        if ((s == -1) && (errno == EINTR))
+            continue;
+        if (s <= 0) {
+            break;
+        }
+        r += s;
+    }
+
+    return (r == count) ? 1 : 0;
+}
+
+int xc_hvm_restore(int xc_handle, int io_fd,
+                     uint32_t dom, unsigned long nr_pfns,
+                     unsigned int store_evtchn, unsigned long *store_mfn,
+                     unsigned int console_evtchn, unsigned long *console_mfn)
+{
+    DECLARE_DOM0_OP;
+
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    char *region_base;
+
+    xc_mmu_t *mmu = NULL;
+
+    xc_dominfo_t info;
+    unsigned int rc = 1, i;
+    uint32_t rec_len;
+    hvm_domain_context_t hvm_ctxt;
+    int vp_eport;    /* signed: xc_evtchn_alloc_unbound() returns <0 on failure */
+    unsigned long count;
+    unsigned long long ptr;
+    unsigned long long v_end, memsize;
+    unsigned long shared_page_frame = 0;
+    shared_iopage_t *sp;
+
+    /* on entry *store_mfn carries the hvm guest memory size (MB) */
+    memsize = (unsigned long long)*store_mfn;
+    v_end = memsize << 20;
+
+    DPRINTF("xc_hvm_restore:dom=%d, nr_pfns=0x%lx, store_evtchn=%d, 
*store_mfn=%ld, console_evtchn=%d, *console_mfn=%ld.\n", 
+            dom, nr_pfns, store_evtchn, *store_mfn, console_evtchn, 
*console_mfn);
+
+
+    max_pfn = nr_pfns;
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERR("Unable to get platform info.");
+        return 1;
+    }
+
+    DPRINTF("xc_hvm_restore start: max_pfn = %lx, max_mfn = %lx, 
hvirt_start=%lx, pt_levels=%d\n",
+            max_pfn,
+            max_mfn,
+            hvirt_start,
+            pt_levels);
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        /* needed for build dom0 op, but might as well do early */
+        ERR("Unable to mlock ctxt");
+        return 1;
+    }
+
+
+    /* We want zeroed memory so use calloc rather than malloc. */
+    p2m        = calloc(max_pfn, sizeof(unsigned long));
+
+    if (p2m == NULL) {
+        ERR("memory alloc failed");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    /* Get the domain's shared-info frame. */
+    op.cmd = DOM0_GETDOMAININFO;
+    op.u.getdomaininfo.domain = (domid_t)dom;
+    if (xc_dom0_op(xc_handle, &op) < 0) {
+        ERR("Could not get information on new domain");
+        goto out;
+    }
+    shared_info_frame = op.u.getdomaininfo.shared_info_frame;
+
+    if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
+        errno = ENOMEM;
+        goto out;
+    }
+
+    if(xc_domain_memory_increase_reservation(
+           xc_handle, dom, max_pfn, 0, 0, NULL) != 0) {
+        ERR("Failed to increase reservation by %lx KB", PFN_TO_KB(max_pfn));
+        errno = ENOMEM;
+        goto out;
+    }
+
+    DPRINTF("Increased domain reservation by %lx KB\n", PFN_TO_KB(max_pfn));
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERR("Could not get domain info");
+        return 1;
+    }
+
+    DPRINTF("after increasing domain reservation, nr_pages=0x%lx, 
maxmemkb=0x%lx\n", info.nr_pages, info.max_memkb);
+
+    /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
+    if (xc_get_pfn_list(xc_handle, dom, p2m, max_pfn) != max_pfn) {
+        ERR("Did not read correct number of frame numbers for new dom");
+        goto out;
+    }
+
+    if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) {
+        ERR("Could not initialise for MMU updates");
+        goto out;
+    }
+
+    /* restore memory */
+    if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ | PROT_WRITE, p2m, max_pfn) ) == 0) {
+        ERR("HVM:map page_array failed!\n");
+        goto out;
+    }
+
+    for (i = 0; i < max_pfn; i++) {
+        void *zpage = region_base + i * PAGE_SIZE;
+        if (!read_exact(io_fd, zpage, PAGE_SIZE)) {
+            ERR("HVM:read page %d failed!\n", i);
+            goto out;
+        }
+    }
+
+    /* Write the machine->phys table entries. */
+    for ( count = 0; count < max_pfn; count++ )
+    {
+        ptr = (unsigned long long)p2m[count] << PAGE_SHIFT;
+        if ( xc_add_mmu_update(xc_handle, mmu,
+                               ptr | MMU_MACHPHYS_UPDATE, count) )
+            goto out;
+    }
+
+    (void)munmap(region_base, max_pfn*PAGE_SIZE);
+
+    if (xc_finish_mmu_updates(xc_handle, mmu)) {
+        ERR("HVM:Error doing finish_mmu_updates()");
+        goto out;
+    }
+
+    /* re-allocate an event channel port for the vcpu */
+    vp_eport = xc_evtchn_alloc_unbound(xc_handle, dom, 0);
+    if ( vp_eport < 0 ) {
+        ERR("Couldn't get unbound port from VMX guest when restore.\n");
+        goto out;
+    }
+
+    /* restore hvm context including pic/pit/shpage */
+    if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+        ERR("error read hvm context size!\n");
+        goto out;
+    }
+    if (rec_len != sizeof(hvm_ctxt)) {
+        ERR("hvm context size dismatch!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERR("error read hvm context!\n");
+        goto out;
+    }
+
+    xc_domain_hvm_setcontext(xc_handle, dom, &hvm_ctxt);
+
+    /* Populate the event channel port in the shared page */
+    shared_page_frame = p2m[(v_end >> PAGE_SHIFT) - 1];
+    if ( (sp = (shared_iopage_t *) xc_map_foreign_range(
+              xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
+              shared_page_frame)) == 0 ) {
+        ERR("map share page fail");
+        goto out;
+    }
+
+    /* set new vp_eport */
+    DPRINTF("new vp_eport=%d.\n", 
+            vp_eport);
+    /*XXX: smp support */
+    sp->vcpu_iodata[0].vp_eport = vp_eport;
+
+    /* restore vcpu ctxt & vmcs */
+    if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+        ERR("error read vcpu context size!\n");
+        goto out;
+    }
+    if (rec_len != sizeof(ctxt)) {
+        ERR("vcpu context size dismatch!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &(ctxt), sizeof(ctxt))) {
+        ERR("error read vcpu context.\n");
+        goto out;
+    }
+
+    if ( (rc = xc_vcpu_setcontext(xc_handle, dom, 0, &ctxt)) ) {
+        ERR("Could not set vcpu context, rc=%d", rc);
+        goto out;
+    }
+
+    /* calculate the store_mfn; a wrong value causes a hang at introduceDomain */
+    *store_mfn = p2m[(v_end >> PAGE_SHIFT) - 2];
+    DPRINTF("hvm restore:calculated new store_mfn=0x%lx, v_end=0x%llx.\n", *store_mfn, v_end);
+
+    rc = 0;
+    goto out;
+
+ out:
+    if ( (rc != 0) && (dom != 0) )
+        xc_domain_destroy(xc_handle, dom);
+    free(mmu);
+    free(p2m);
+
+    DPRINTF("Restore exit with rc=%d\n", rc);
+
+    return rc;
+}
diff -r 98c3ddf83a59 -r 2abb1c801ab7 tools/libxc/xc_hvm_save.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_hvm_save.c Wed Jul 19 16:09:59 2006 +0800
@@ -0,0 +1,207 @@
+/******************************************************************************
+ * xc_hvm_save.c
+ *
+ * Save the state of a running HVM guest.
+ *
+ * Copyright (c) 2006 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
+
+int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
+{
+    xc_dominfo_t info;
+
+    int rc = 1, i;
+    int live  = (flags & XCFLAGS_LIVE);
+    int debug = (flags & XCFLAGS_DEBUG);
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    /* A copy of hvm domain context */
+    hvm_domain_context_t hvm_ctxt;
+
+    /* Live mapping of shared info structure */
+    shared_info_t *live_shinfo = NULL;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    uint32_t nr_pfns, rec_size;
+    unsigned long *page_array;
+
+    DPRINTF("xc_hvm_save:dom=%d, max_iters=%d, max_factor=%d, flags=0x%x.\n",
+            dom, max_iters, max_factor, flags);
+
+    /* If no explicit control parameters given, use defaults */
+/*    if(!max_iters)*/
+/*        max_iters = DEF_MAX_ITERS;*/
+/*    if(!max_factor)*/
+/*        max_factor = DEF_MAX_FACTOR;*/
+
+/*    initialize_mbit_rate();*/
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERR("HVM:Unable to get platform info.");
+        return 1;
+    }
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERR("HVM:Could not get domain info");
+        return 1;
+    }
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        ERR("HVM:Unable to mlock ctxt");
+        return 1;
+    }
+
+    /* Only have to worry about vcpu 0 even for SMP */
+    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
+        ERR("HVM:Could not get vcpu context");
+        goto out;
+    }
+    shared_info_frame = info.shared_info_frame;
+
+    /* A cheesy test to see whether the domain contains valid state. */
+    if (ctxt.ctrlreg[3] == 0)
+    {
+        ERR("Domain is not in a valid HVM guest state");
+        goto out;
+    }
+
+   /* cheesy sanity check */
+    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
+        ERR("Invalid HVM state record -- pfn count out of range: %lu",
+            (info.max_memkb >> (PAGE_SHIFT - 10)));
+        goto out;
+    }
+
+    /* Map the shared info frame */
+    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                            PROT_READ, shared_info_frame))) {
+        ERR("HVM:Couldn't map live_shinfo");
+        goto out;
+    }
+
+    max_pfn = live_shinfo->arch.max_pfn;
+
+    DPRINTF("saved hvm domain info:max_pfn=0x%lx, max_mfn=0x%lx, 
nr_pages=0x%lx\n", max_pfn, max_mfn, info.nr_pages); 
+
+    if (live) {
+        ERR("hvm domain doesn't support live migration now.\n");
+        if (debug)
+            ERR("hvm domain debug on.\n");
+        goto out;
+    }
+
+    /* suspend hvm domain */
+    if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
+        ERR("HVM Domain appears not to have suspended");
+        goto out;
+    }
+
+    nr_pfns = info.nr_pages;
+    DPRINTF("after suspend hvm domain nr_pages=0x%x.\n", nr_pfns);
+
+    /* get all the HVM domain pfns */
+    if ( (page_array = (unsigned long *) malloc (sizeof(unsigned long) * nr_pfns)) == NULL) {
+        ERR("HVM:malloc fail!\n");
+        goto out;
+    }
+
+    if ( xc_get_pfn_list(xc_handle, dom, page_array, nr_pfns) != nr_pfns) {
+        ERR("HVM domain get pfn list fail!\n");
+        goto out;
+    }
+
+    if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ | PROT_WRITE, page_array, nr_pfns) ) == 0) {
+        ERR("HVM domain map pages failed!\n");
+        goto out;
+    }
+
+
+    /* Start writing out the saved-domain record. begin with mem */
+    if (!write_exact(io_fd, &nr_pfns, sizeof(unsigned int))) {
+        ERR("write: nr_pfns");
+        goto out;
+    }
+
+    for (i = 0; i < nr_pfns; i++) {
+        void *zpage = region_base + i * PAGE_SIZE;
+        if (ratewrite(io_fd, zpage, PAGE_SIZE) != PAGE_SIZE) {
+            ERR("HVM:write page %d failed!.\n", i);
+            goto out;
+        }
+    }
+
+    /* save hvm hypervisor state including pic/pit/shpage */
+    if (mlock(&hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERR("Unable to mlock ctxt");
+        return 1;
+    }
+    xc_domain_hvm_getcontext(xc_handle, dom, &hvm_ctxt);
+
+/*    ERR("hvm_getcontext get %d, size=%d!\n", hvm_ctxt.size, 
sizeof(hvm_ctxt));*/
+    rec_size = sizeof(hvm_ctxt);
+    if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+        ERR("error write hvm ctxt size");
+        goto out;
+    }
+
+    if ( !write_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt)) ) {
+        ERR("write HVM info failed!\n");
+    }
+
+
+    /* save vcpu/vmcs context XXX:smp support*/
+    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
+        ERR("HVM:Could not get vcpu context");
+        goto out;
+    }
+
+    rec_size = sizeof(ctxt);
+    if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+        ERR("error write vcpu ctxt size");
+        goto out;
+    }
+
+    if (!write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
+        ERR("write vmcs failed!\n");
+
+
+
+    /* Success! */
+    rc = 0;
+
+ out:
+    return !!rc;
+}
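
Taken together, the stream that xc_hvm_save writes out, and that xc_hvm_restore consumes, is a flat sequence of records (the leading pfn count is presumably eaten by the toolstack before xc_hvm_restore is entered, since that function receives nr_pfns as an argument and starts reading at the page data). A sketch of the layout, as implied by the read/write calls above:

/*
 * HVM save image layout (sketch, inferred from the code above):
 *
 *   uint32_t              nr_pfns;                    number of guest pages
 *   uint8_t               page[nr_pfns][PAGE_SIZE];   raw guest RAM, pfn order
 *   uint32_t              rec_len;                    == sizeof(hvm_domain_context_t)
 *   hvm_domain_context_t  hvm_ctxt;                   PIC/PIT/shared-page state
 *   uint32_t              rec_len;                    == sizeof(vcpu_guest_context_t)
 *   vcpu_guest_context_t  vcpu0;                      includes hvmcpu_ctxt (vmcs_data)
 */

On restore, the shared io page is taken to be the guest's last RAM page ((v_end >> PAGE_SHIFT) - 1) and the xenstore page the one below it ((v_end >> PAGE_SHIFT) - 2), which is why *store_mfn is recomputed from the new p2m table rather than carried in the image.
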
diff -r 98c3ddf83a59 -r 2abb1c801ab7 xen/include/public/vmcs_data.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/vmcs_data.h    Wed Jul 19 16:09:59 2006 +0800
@@ -0,0 +1,68 @@
+/******************************************************************************
+ * vmcs_data.h
+ * 
+ * Copyright (c) 2006 Intel Corporation
+ * 
+ */
+
+#ifndef __XEN_PUBLIC_VMCS_DATA_H__
+#define __XEN_PUBLIC_VMCS_DATA_H__
+
+/*
+ * World vmcs state
+ */
+struct vmcs_data {
+    uint64_t  eip;        /* execution pointer */
+    uint64_t  esp;        /* stack pointer */
+    uint64_t  eflags;     /* flags register */
+    uint64_t  cr0;
+    uint64_t  cr3;        /* page table directory */
+    uint64_t  cr4;
+    uint32_t  idtr_limit; /* idt */
+    uint64_t  idtr_base;
+    uint32_t  gdtr_limit; /* gdt */
+    uint64_t  gdtr_base;
+    uint32_t  cs_sel;     /* cs selector */
+    uint32_t  cs_limit;
+    uint64_t  cs_base;
+    uint32_t  cs_arbytes;
+    uint32_t  ds_sel;     /* ds selector */
+    uint32_t  ds_limit;
+    uint64_t  ds_base;
+    uint32_t  ds_arbytes;
+    uint32_t  es_sel;     /* es selector */
+    uint32_t  es_limit;
+    uint64_t  es_base;
+    uint32_t  es_arbytes;
+    uint32_t  ss_sel;     /* ss selector */
+    uint32_t  ss_limit;
+    uint64_t  ss_base;
+    uint32_t  ss_arbytes;
+    uint32_t  fs_sel;     /* fs selector */
+    uint32_t  fs_limit;
+    uint64_t  fs_base;
+    uint32_t  fs_arbytes;
+    uint32_t  gs_sel;     /* gs selector */
+    uint32_t  gs_limit;
+    uint64_t  gs_base;
+    uint32_t  gs_arbytes;
+    uint32_t  tr_sel;     /* task selector */
+    uint32_t  tr_limit;
+    uint64_t  tr_base;
+    uint32_t  tr_arbytes;
+    uint32_t  ldtr_sel;   /* ldtr selector */
+    uint32_t  ldtr_limit;
+    uint64_t  ldtr_base;
+    uint32_t  ldtr_arbytes;
+    uint32_t  sysenter_cs;
+    uint64_t  sysenter_esp;
+    uint64_t  sysenter_eip;
+    /* msr for em64t */
+    uint64_t shadow_gs;
+    uint64_t flags;
+    /* same size as VMX_MSR_COUNT */
+    uint64_t msr_items[6];
+    uint64_t cpu_state;
+};
+typedef struct vmcs_data vmcs_data_t;
+#endif
# HG changeset patch
# User Edwin Zhai <edwin.zhai@xxxxxxxxx>
# Node ID 98c3ddf83a59b0cbbdce63bb210adfd0d2ec1aea
# Parent  ecb8ff1fcf1fc24561c8bd272a58828592d90806
cirrus&rtl8139 coexist issue fix

diff -r ecb8ff1fcf1f -r 98c3ddf83a59 tools/ioemu/target-i386-dm/exec-dm.c
--- a/tools/ioemu/target-i386-dm/exec-dm.c      Fri Jul 14 18:53:27 2006 +0100
+++ b/tools/ioemu/target-i386-dm/exec-dm.c      Wed Jul 19 13:45:04 2006 +0800
@@ -382,7 +382,7 @@ int iomem_index(target_phys_addr_t addr)
                 start = mmio[i].start;
                 end = mmio[i].start + mmio[i].size;
 
-                if ((addr >= start) && (addr <= end)){
+                if ((addr >= start) && (addr < end)){
                         return (mmio[i].io_index >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
                 }
         }
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel