WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 6/8] HVM save restore: guest memory handling

To: Ian Pratt <Ian.Pratt@xxxxxxxxxxxx>, Keir Fraser <Keir.Fraser@xxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH 6/8] HVM save restore: guest memory handling
From: "Zhai, Edwin" <edwin.zhai@xxxxxxxxx>
Date: Thu, 11 Jan 2007 22:11:51 +0800
Cc: xen-devel@xxxxxxxxxxxxxxxxxxx, edwin.zhai@xxxxxxxxx
Delivery-date: Thu, 11 Jan 2007 06:13:52 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mutt/1.5.11
[PATCH 6/8] HVM save restore: guest memory handling

Signed-off-by: Zhai Edwin <edwin.zhai@xxxxxxxxx>

Add support for saving and restoring HVM guest memory.


diff -r bb1c450b2739 tools/libxc/xc_hvm_restore.c
--- a/tools/libxc/xc_hvm_restore.c      Thu Jan 11 21:03:11 2007 +0800
+++ b/tools/libxc/xc_hvm_restore.c      Thu Jan 11 21:05:45 2007 +0800
@@ -31,6 +31,40 @@
 #include <xen/hvm/ioreq.h>
 #include <xen/hvm/params.h>
 #include <xen/hvm/e820.h>
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+/* A table mapping each PFN to its new MFN. */
+static xen_pfn_t *p2m = NULL;
+
+static ssize_t
+read_exact(int fd, void *buf, size_t count)
+{
+    int r = 0, s;
+    unsigned char *b = buf;
+
+    while (r < count) {
+        s = read(fd, &b[r], count - r);
+        if ((s == -1) && (errno == EINTR))
+            continue;
+        if (s <= 0) {
+            break;
+        }
+        r += s;
+    }
+
+    return (r == count) ? 1 : 0;
+}
 
 int xc_hvm_restore(int xc_handle, int io_fd,
                      uint32_t dom, unsigned long nr_pfns,
@@ -38,5 +72,289 @@ int xc_hvm_restore(int xc_handle, int io
                      unsigned int console_evtchn, unsigned long *console_mfn,
                      unsigned int pae, unsigned int apic)
 {
-    return 0;
+    DECLARE_DOMCTL;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    char *region_base;
+
+    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
+
+    xc_dominfo_t info;
+    unsigned int rc = 1, n, i;
+    uint32_t rec_len, nr_vcpus;
+    hvm_domain_context_t hvm_ctxt;
+    unsigned long long v_end, memsize;
+    unsigned long shared_page_nr;
+
+    unsigned long mfn, pfn;
+    unsigned int prev_pc, this_pc;
+    int verify = 0;
+
+    /* Types of the pfns in the current region */
+    unsigned long region_pfn_type[MAX_BATCH_SIZE];
+
+    /* hvm guest mem size (Mb) */
+    memsize = (unsigned long long)*store_mfn;
+    v_end = memsize << 20;
+
+    DPRINTF("xc_hvm_restore:dom=%d, nr_pfns=0x%lx, store_evtchn=%d, *store_mfn=%ld, console_evtchn=%d, *console_mfn=%ld, pae=%u, apic=%u.\n",
+            dom, nr_pfns, store_evtchn, *store_mfn, console_evtchn, *console_mfn, pae, apic);
+
+    max_pfn = nr_pfns;
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERROR("Unable to get platform info.");
+        return 1;
+    }
+
+    DPRINTF("xc_hvm_restore start: max_pfn = %lx, max_mfn = %lx, hvirt_start=%lx, pt_levels=%d\n",
+            max_pfn,
+            max_mfn,
+            hvirt_start,
+            pt_levels);
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        /* needed for build dom0 op, but might as well do early */
+        ERROR("Unable to mlock ctxt");
+        return 1;
+    }
+
+
+    p2m        = malloc(max_pfn * sizeof(xen_pfn_t));
+
+    if (p2m == NULL) {
+        ERROR("memory alloc failed");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    /* Get the domain's shared-info frame. */
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)dom;
+    if (xc_domctl(xc_handle, &domctl) < 0) {
+        ERROR("Could not get information on new domain");
+        goto out;
+    }
+    shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
+
+    if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
+        errno = ENOMEM;
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfn; i++ )
+        p2m[i] = i;
+    for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < max_pfn; i++ )
+        p2m[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
+
+    /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */
+    rc = xc_domain_memory_populate_physmap(
+        xc_handle, dom, (max_pfn > 0xa0) ? 0xa0 : max_pfn,
+        0, 0, &p2m[0x00]);
+    if ( (rc == 0) && (max_pfn > 0xc0) )
+        rc = xc_domain_memory_populate_physmap(
+            xc_handle, dom, max_pfn - 0xc0, 0, 0, &p2m[0xc0]);
+    if ( rc != 0 )
+    {
+        PERROR("Could not allocate memory for HVM guest.\n");
+        goto out;
+    }
+
+
+    /**********XXXXXXXXXXXXXXXX******************/
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)dom;
+    if (xc_domctl(xc_handle, &domctl) < 0) {
+        ERROR("Could not get information on new domain");
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfn; i++)
+        p2m[i] = i;
+
+    prev_pc = 0;
+
+    n = 0;
+    while (1) {
+
+        int j;
+
+        this_pc = (n * 100) / max_pfn;
+        if ( (this_pc - prev_pc) >= 5 )
+        {
+            PPRINTF("\b\b\b\b%3d%%", this_pc);
+            prev_pc = this_pc;
+        }
+
+        if (!read_exact(io_fd, &j, sizeof(int))) {
+            ERROR("HVM restore Error when reading batch size");
+            goto out;
+        }
+
+        PPRINTF("batch %d\n",j);
+
+        if (j == -1) {
+            verify = 1;
+            DPRINTF("Entering page verify mode\n");
+            continue;
+        }
+
+        if (j == 0)
+            break;  /* our work here is done */
+
+        if (j > MAX_BATCH_SIZE) {
+            ERROR("Max batch size exceeded. Giving up.");
+            goto out;
+        }
+
+        if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) {
+            ERROR("Error when reading region pfn types");
+            goto out;
+        }
+
+        region_base = xc_map_foreign_batch(
+            xc_handle, dom, PROT_WRITE, region_pfn_type, j);
+
+        for ( i = 0; i < j; i++ )
+        {
+            void *page;
+
+            pfn = region_pfn_type[i];
+            if ( pfn > max_pfn )
+            {
+                ERROR("pfn out of range");
+                goto out;
+            }
+
+            if ( pfn >= 0xa0 && pfn < 0xc0) {
+                ERROR("hvm restore:pfn in vga hole");
+                goto out;
+            }
+
+
+            mfn = p2m[pfn];
+
+            /* In verify mode, we use a copy; otherwise we work in place */
+            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
+
+            if (!read_exact(io_fd, page, PAGE_SIZE)) {
+                ERROR("Error when reading page (%x)", i);
+                goto out;
+            }
+
+            if (verify) {
+
+                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
+
+                if (res) {
+
+                    int v;
+
+                    DPRINTF("************** pfn=%lx mfn=%lx gotcs=%08lx "
+                            "actualcs=%08lx\n", pfn, p2m[pfn],
+                            csum_page(region_base + i*PAGE_SIZE),
+                            csum_page(buf));
+
+                    for (v = 0; v < 4; v++) {
+
+                        unsigned long *p = (unsigned long *)
+                            (region_base + i*PAGE_SIZE);
+                        if (buf[v] != p[v])
+                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
+                    }
+                }
+            }
+
+        } /* end of 'batch' for loop */
+        munmap(region_base, j*PAGE_SIZE);
+        n+= j; /* crude stats */
+
+    }/*while 1*/
+    
+/*    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_APIC_ENABLED, apic);*/
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_EVTCHN, store_evtchn);
+
+    if ( v_end > HVM_BELOW_4G_RAM_END )
+        shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1;
+    else
+        shared_page_nr = (v_end >> PAGE_SHIFT) - 1;
+
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, shared_page_nr-1);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr);
+
+    /* caculate the store_mfn , wrong val cause hang when introduceDomain */
+    *store_mfn = (v_end >> PAGE_SHIFT) - 2;
+    DPRINTF("hvm restore:calculate new store_mfn=0x%lx,v_end=0x%llx..\n", *store_mfn, v_end);
+
+    /* restore hvm context including pic/pit/shpage */
+    if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+        ERROR("error read hvm context size!\n");
+        goto out;
+    }
+    if (rec_len != sizeof(hvm_ctxt)) {
+        ERROR("hvm context size dismatch!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERROR("error read hvm context!\n");
+        goto out;
+    }
+
+    if (( rc = xc_domain_hvm_setcontext(xc_handle, dom, &hvm_ctxt))) {
+        ERROR("error set hvm context!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
+        ERROR("error read nr vcpu !\n");
+        goto out;
+    }
+    DPRINTF("hvm restore:get nr_vcpus=%d.\n", nr_vcpus);
+
+    for (i =0; i < nr_vcpus; i++) {
+        if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+            ERROR("error read vcpu context size!\n");
+            goto out;
+        }
+        if (rec_len != sizeof(ctxt)) {
+            ERROR("vcpu context size dismatch!\n");
+            goto out;
+        }
+
+        if (!read_exact(io_fd, &(ctxt), sizeof(ctxt))) {
+            ERROR("error read vcpu context.\n");
+            goto out;
+        }
+
+        if ( (rc = xc_vcpu_setcontext(xc_handle, dom, i, &ctxt)) ) {
+            ERROR("Could not set vcpu context, rc=%d", rc);
+            goto out;
+        }
+    }
+
+    rc = 0;
+    goto out;
+
+ out:
+    if ( (rc != 0) && (dom != 0) )
+        xc_domain_destroy(xc_handle, dom);
+    free(p2m);
+
+    DPRINTF("Restore exit with rc=%d\n", rc);
+
+    return rc;
 }
diff -r bb1c450b2739 tools/libxc/xc_hvm_save.c
--- a/tools/libxc/xc_hvm_save.c Thu Jan 11 21:03:11 2007 +0800
+++ b/tools/libxc/xc_hvm_save.c Thu Jan 11 21:05:10 2007 +0800
@@ -32,9 +32,696 @@
 #include "xg_private.h"
 #include "xg_save_restore.h"
 
+/*
+** Default values for important tuning parameters. Can override by passing
+** non-zero replacement values to xc_hvm_save().
+**
+** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
+**
+*/
+#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
+#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+static inline int test_bit (int nr, volatile void * addr)
+{
+    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+static inline void clear_bit (int nr, volatile void * addr)
+{
+    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+static inline int permute( int i, int nr, int order_nr  )
+{
+    /* Need a simple permutation function so that we scan pages in a
+       pseudo random order, enabling us to get a better estimate of
+       the domain's page dirtying rate as we go (there are often
+       contiguous ranges of pfns that have similar behaviour, and we
+       want to mix them up. */
+
+    /* e.g. nr->oder 15->4 16->4 17->5 */
+    /* 512MB domain, 128k pages, order 17 */
+
+    /*
+      QPONMLKJIHGFEDCBA
+             QPONMLKJIH
+      GFEDCBA
+     */
+
+    /*
+      QPONMLKJIHGFEDCBA
+                  EDCBA
+             QPONM
+      LKJIHGF
+      */
+
+    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
+    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
+
+    return i;
+}
+
+static uint64_t tv_to_us(struct timeval *new)
+{
+    return (new->tv_sec * 1000000) + new->tv_usec;
+}
+
+static uint64_t llgettimeofday(void)
+{
+    struct timeval now;
+    gettimeofday(&now, NULL);
+    return tv_to_us(&now);
+}
+
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{
+    return ((new->tv_sec - old->tv_sec)*1000000 ) +
+        (new->tv_usec - old->tv_usec);
+}
+
+
+#define RATE_IS_MAX() (0)
+#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
+#define initialize_mbit_rate()
+
+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+    if(write(fd, buf, count) != count)
+        return 0;
+    return 1;
+}
+
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+                       xc_shadow_op_stats_t *stats, int print)
+{
+    static struct timeval wall_last;
+    static long long      d0_cpu_last;
+    static long long      d1_cpu_last;
+
+    struct timeval        wall_now;
+    long long             wall_delta;
+    long long             d0_cpu_now, d0_cpu_delta;
+    long long             d1_cpu_now, d1_cpu_delta;
+
+    gettimeofday(&wall_now, NULL);
+
+    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
+    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
+
+    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
+        DPRINTF("ARRHHH!!\n");
+
+    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
+
+    if (wall_delta == 0) wall_delta = 1;
+
+    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+    if (print)
+        DPRINTF(
+                "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+                "dirtied %dMb/s %" PRId32 " pages\n",
+                wall_delta,
+                (int)((d0_cpu_delta*100)/wall_delta),
+                (int)((d1_cpu_delta*100)/wall_delta),
+                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+                stats->dirty_count);
+
+    d0_cpu_last = d0_cpu_now;
+    d1_cpu_last = d1_cpu_now;
+    wall_last   = wall_now;
+
+    return 0;
+}
+
+static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
+                          unsigned long *arr, int runs)
+{
+    long long start, now;
+    xc_shadow_op_stats_t stats;
+    int j;
+
+    start = llgettimeofday();
+
+    for (j = 0; j < runs; j++) {
+        int i;
+
+        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+                          arr, max_pfn, NULL, 0, NULL);
+        DPRINTF("#Flush\n");
+        for ( i = 0; i < 40; i++ ) {
+            usleep(50000);
+            now = llgettimeofday();
+            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
+                              NULL, 0, NULL, 0, &stats);
+
+            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
+                    ((now-start)+500)/1000,
+                    stats.fault_count, stats.dirty_count);
+        }
+    }
+
+    return -1;
+}
+
+static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+                             int dom, xc_dominfo_t *info,
+                             vcpu_guest_context_t *ctxt)
+{
+    int i = 0;
+
+    if (!(*suspend)(dom)) {
+        ERROR("Suspend request failed");
+        return -1;
+    }
+
+ retry:
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
+        ERROR("Could not get domain info");
+        return -1;
+    }
+
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt))
+        ERROR("Could not get vcpu context");
+
+
+    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
+        return 0; // success
+
+    if (info->paused) {
+        // try unpausing domain, wait, and retest
+        xc_domain_unpause( xc_handle, dom );
+
+        ERROR("Domain was paused. Wait and re-test.");
+        usleep(10000);  // 10ms
+
+        goto retry;
+    }
+
+
+    if( ++i < 100 ) {
+        ERROR("Retry suspend domain.");
+        usleep(10000);  // 10ms
+        goto retry;
+    }
+
+    ERROR("Unable to suspend domain.");
+
+    return -1;
+}
+
 int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                   uint32_t max_factor, uint32_t flags, int (*suspend)(int))
 {
-
-    return 0;
-}
+    xc_dominfo_t info;
+
+    int rc = 1, i, last_iter, iter = 0;
+    int live  = (flags & XCFLAGS_LIVE);
+    int debug = (flags & XCFLAGS_DEBUG);
+    int sent_last_iter, skip_this_iter;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    /* A table containg the type of each PFN (/not/ MFN!). */
+    unsigned long *pfn_type = NULL;
+    unsigned long *pfn_batch = NULL;
+
+    /* A copy of hvm domain context */
+    hvm_domain_context_t hvm_ctxt;
+
+    /* Live mapping of shared info structure */
+    shared_info_t *live_shinfo = NULL;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    uint32_t nr_pfns, rec_size, nr_vcpus;
+    unsigned long *page_array = NULL;
+
+    /* power of 2 order of max_pfn */
+    int order_nr;
+
+    /* bitmap of pages:
+       - that should be sent this iteration (unless later marked as skip);
+       - to skip this iteration because already dirty; */
+    unsigned long *to_send = NULL, *to_skip = NULL;
+
+    xc_shadow_op_stats_t stats;
+
+    unsigned long total_sent    = 0;
+
+    DPRINTF("xc_hvm_save:dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, live=%d, debug=%d.\n",
+            dom, max_iters, max_factor, flags,
+            live, debug);
+
+    /* If no explicit control parameters given, use defaults */
+    if(!max_iters)
+        max_iters = DEF_MAX_ITERS;
+    if(!max_factor)
+        max_factor = DEF_MAX_FACTOR;
+
+    initialize_mbit_rate();
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERROR("HVM:Unable to get platform info.");
+        return 1;
+    }
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERROR("HVM:Could not get domain info");
+        return 1;
+    }
+    nr_vcpus = info.nr_online_vcpus;
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        ERROR("HVM:Unable to mlock ctxt");
+        return 1;
+    }
+
+    /* Only have to worry about vcpu 0 even for SMP */
+    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
+        ERROR("HVM:Could not get vcpu context");
+        goto out;
+    }
+    shared_info_frame = info.shared_info_frame;
+
+    /* A cheesy test to see whether the domain contains valid state. */
+    if (ctxt.ctrlreg[3] == 0)
+    {
+        ERROR("Domain is not in a valid HVM guest state");
+        goto out;
+    }
+
+   /* cheesy sanity check */
+    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
+        ERROR("Invalid HVM state record -- pfn count out of range: %lu",
+            (info.max_memkb >> (PAGE_SHIFT - 10)));
+        goto out;
+    }
+
+    /* Map the shared info frame */
+    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                            PROT_READ, shared_info_frame))) {
+        ERROR("HVM:Couldn't map live_shinfo");
+        goto out;
+    }
+
+    max_pfn = live_shinfo->arch.max_pfn;
+
+    DPRINTF("saved hvm domain info:max_memkb=0x%lx, max_mfn=0x%lx, nr_pages=0x%lx\n", info.max_memkb, max_mfn, info.nr_pages);
+
+    /* nr_pfns: total pages excluding vga acc mem
+     * max_pfn: nr_pfns + 0x20 vga hole(0xa0~0xc0)
+     * getdomaininfo.tot_pages: all the allocated pages for this domain
+     */
+    if (live) {
+        ERROR("hvm domain doesn't support live migration now.\n");
+        goto out;
+
+        if (xc_shadow_control(xc_handle, dom,
+                              XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                              NULL, 0, NULL, 0, NULL) < 0) {
+            ERROR("Couldn't enable shadow mode");
+            goto out;
+        }
+
+        /* excludes vga acc mem */
+        nr_pfns = info.nr_pages - 0x800;
+
+        last_iter = 0;
+        DPRINTF("hvm domain live migration debug start: logdirty enable.\n");
+    } else {
+        /* This is a non-live suspend. Issue the call back to get the
+           domain suspended */
+
+        last_iter = 1;
+
+        /* suspend hvm domain */
+        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
+            ERROR("HVM Domain appears not to have suspended");
+            goto out;
+        }
+        nr_pfns = info.nr_pages;
+        DPRINTF("after suspend hvm domain nr_pages=0x%x.\n", nr_pfns);
+    }
+
+    DPRINTF("after 1st handle hvm domain nr_pfns=0x%x, nr_pages=0x%lx, max_memkb=0x%lx, live=%d.\n",
+            nr_pfns,
+            info.nr_pages,
+            info.max_memkb,
+            live);
+
+    nr_pfns = info.nr_pages;
+
+    /*XXX: caculate the VGA hole*/
+    max_pfn = nr_pfns + 0x20;
+
+    skip_this_iter = 0;/*XXX*/
+    /* pretend we sent all the pages last iteration */
+    sent_last_iter = max_pfn;
+
+    /* calculate the power of 2 order of max_pfn, e.g.
+       15->4 16->4 17->5 */
+    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
+        continue;
+
+    /* Setup to_send / to_fix and to_skip bitmaps */
+    to_send = malloc(BITMAP_SIZE);
+    to_skip = malloc(BITMAP_SIZE);
+
+    if (!to_send ||!to_skip) {
+        ERROR("Couldn't allocate to_send array");
+        goto out;
+    }
+
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    if (lock_pages(to_send, BITMAP_SIZE)) {
+        ERROR("Unable to lock to_send");
+        return 1;
+    }
+
+    /* (to fix is local only) */
+    if (lock_pages(to_skip, BITMAP_SIZE)) {
+        ERROR("Unable to lock to_skip");
+        return 1;
+    }
+
+    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);
+
+    /* get all the HVM domain pfns */
+    if ( (page_array = (unsigned long *) malloc (sizeof(unsigned long) * max_pfn)) == NULL) {
+        ERROR("HVM:malloc fail!\n");
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfn; i++)
+        page_array[i] = i;
+
+
+    /* We want zeroed memory so use calloc rather than malloc. */
+    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
+    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
+
+    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
+        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    if (lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type))) {
+        ERROR("Unable to lock");
+        goto out;
+    }
+
+    /* Start writing out the saved-domain record. */
+    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
+        ERROR("write: max_pfn");
+        goto out;
+    }
+
+    while(1) {
+
+        unsigned int prev_pc, sent_this_iter, N, batch;
+
+        iter++;
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+        prev_pc = 0;
+        N=0;
+
+        DPRINTF("Saving HVM domain memory pages: iter %d   0%%", iter);
+
+        while( N < max_pfn ){
+
+            unsigned int this_pc = (N * 100) / max_pfn;
+
+            if ((this_pc - prev_pc) >= 5) {
+                DPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            /* slightly wasteful to peek the whole array evey time,
+               but this is fast enough for the moment. */
+            if (!last_iter && xc_shadow_control(
+                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK,
+                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
+                ERROR("Error peeking HVM shadow bitmap");
+                goto out;
+            }
+
+
+            /* load pfn_type[] with the mfn of all the pages we're doing in
+               this batch. */
+            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {
+
+                int n = permute(N, max_pfn, order_nr);
+
+                if (debug) {
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d \n",
+                            iter, (unsigned long)n, page_array[n],
+                            test_bit(n, to_send));
+                }
+
+                if (!last_iter && test_bit(n, to_send)&& test_bit(n, to_skip))
+                    skip_this_iter++; /* stats keeping */
+
+                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
+                      (test_bit(n, to_send) && last_iter)))
+                    continue;
+
+                if (n >= 0xa0 && n < 0xc0) {
+/*                    DPRINTF("get a vga hole pfn= %x.\n", n);*/
+                    continue;
+                }
+                /*
+                ** we get here if:
+                **  1. page is marked to_send & hasn't already been re-dirtied
+                **  2. (ignore to_skip in last iteration)
+                */
+
+                pfn_batch[batch] = n;
+                pfn_type[batch]  = page_array[n];
+
+                batch++;
+            }
+
+            if (batch == 0)
+                goto skip; /* vanishingly unlikely... */
+
+            /* map_foreign use pfns now !*/
+            if ((region_base = xc_map_foreign_batch(
+                     xc_handle, dom, PROT_READ, pfn_batch, batch)) == 0) {
+                ERROR("map batch failed");
+                goto out;
+            }
+
+            /* write num of pfns */
+            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
+                ERROR("Error when writing to state file (2)");
+                goto out;
+            }
+
+            /* write all the pfns */
+            if(!write_exact(io_fd, pfn_batch, sizeof(unsigned long)*batch)) {
+                ERROR("Error when writing to state file (3)");
+                goto out;
+            }
+
+            if (ratewrite(io_fd, region_base, PAGE_SIZE * batch) != PAGE_SIZE * batch) {
+                ERROR("ERROR when writting to state file (4)");
+                goto out;
+            }
+
+
+            sent_this_iter += batch;
+
+            munmap(region_base, batch*PAGE_SIZE);
+
+        } /* end of this while loop for this iteration */
+
+      skip:
+
+        total_sent += sent_this_iter;
+
+        DPRINTF("\r %d: sent %d, skipped %d, ",
+                iter, sent_this_iter, skip_this_iter );
+
+        if (last_iter) {
+            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+            DPRINTF("Total pages sent= %ld (%.2fx)\n",
+                    total_sent, ((float)total_sent)/max_pfn );
+        }
+
+        if (last_iter && debug){
+            int minusone = -1;
+            memset(to_send, 0xff, BITMAP_SIZE);
+            debug = 0;
+            DPRINTF("Entering debug resend-all mode\n");
+
+            /* send "-1" to put receiver into debug mode */
+            if(!write_exact(io_fd, &minusone, sizeof(int))) {
+                ERROR("Error when writing to state file (6)");
+                goto out;
+            }
+
+            continue;
+        }
+
+        if (last_iter) break;
+
+        if (live) {
+
+
+            if(
+                ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
+                (iter >= max_iters) ||
+                (sent_this_iter+skip_this_iter < 50) ||
+                (total_sent > max_pfn*max_factor) ) {
+
+                DPRINTF("Start last iteration for HVM domain\n");
+                last_iter = 1;
+
+                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
+                                      &ctxt)) {
+                    ERROR("Domain appears not to have suspended");
+                    goto out;
+                }
+
+                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
+                        info.shared_info_frame,
+                        (unsigned long)ctxt.user_regs.eip,
+                        (unsigned long)ctxt.user_regs.edx);
+            }
+
+            if (xc_shadow_control(xc_handle, dom, 
+                                  XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
+                                  max_pfn, NULL, 0, &stats) != max_pfn) {
+                ERROR("Error flushing shadow PT");
+                goto out;
+            }
+
+            sent_last_iter = sent_this_iter;
+
+            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
+
+        }
+
+
+    } /* end of while 1 */
+
+
+    DPRINTF("All HVM memory is saved\n");
+
+    /* Zero terminate */
+    i = 0;
+    if (!write_exact(io_fd, &i, sizeof(int))) {
+        ERROR("Error when writing to state file (6)");
+        goto out;
+    }
+
+    /* save hvm hypervisor state including pic/pit/shpage */
+    if (mlock(&hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERROR("Unable to mlock ctxt");
+        return 1;
+    }
+
+    if (xc_domain_hvm_getcontext(xc_handle, dom, &hvm_ctxt)){
+        ERROR("HVM:Could not get hvm context");
+        goto out;
+    }
+
+    rec_size = sizeof(hvm_ctxt);
+    if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+        ERROR("error write hvm ctxt size");
+        goto out;
+    }
+
+    if ( !write_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt)) ) {
+        ERROR("write HVM info failed!\n");
+    }
+
+    /* save vcpu/vmcs context */
+    if (!write_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
+        ERROR("error write nr vcpus");
+        goto out;
+    }
+
+    /*XXX: need a online map to exclude down cpu */
+    for (i = 0; i < nr_vcpus; i++) {
+
+        if (xc_vcpu_getcontext(xc_handle, dom, i, &ctxt)) {
+            ERROR("HVM:Could not get vcpu context");
+            goto out;
+        }
+
+        rec_size = sizeof(ctxt);
+        DPRINTF("write %d vcpucontext of total %d.\n", i, nr_vcpus); 
+        if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+            ERROR("error write vcpu ctxt size");
+            goto out;
+        }
+
+        if (!write_exact(io_fd, &(ctxt), sizeof(ctxt)) ) {
+            ERROR("write vmcs failed!\n");
+            goto out;
+        }
+    }
+ 
+    /* Success! */
+    rc = 0;
+
+ out:
+
+    if (live) {
+        if(xc_shadow_control(xc_handle, dom, 
+                             XEN_DOMCTL_SHADOW_OP_OFF,
+                             NULL, 0, NULL, 0, NULL) < 0) {
+            DPRINTF("Warning - couldn't disable shadow mode");
+        }
+    }
+
+    free(page_array);
+
+    free(pfn_type);
+    free(pfn_batch);
+    free(to_send);
+    free(to_skip);
+
+    return !!rc;
+}

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-devel] [PATCH 6/8] HVM save restore: guest memory handling, Zhai, Edwin <=