WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] Enable save/restore for PAE domains.

# HG changeset patch
# User smh22@xxxxxxxxxxxxxxxxxxxx
# Node ID b3c2bc39d8150b5caf8d4310dd0bbca1a25cc93a
# Parent  abbe3df337747e70cc2659e349a6d5f7c9748419
Enable save/restore for PAE domains.

This includes quite a few cleanups / refactoring of the old code, some
of which is intended to prepare for 64-bit save/restore. 

Signed-off-by: Steven Hand <steven@xxxxxxxxxxxxx>

diff -r abbe3df33774 -r b3c2bc39d815 tools/libxc/xc_linux_restore.c
--- a/tools/libxc/xc_linux_restore.c    Tue Nov  8 17:39:58 2005
+++ b/tools/libxc/xc_linux_restore.c    Tue Nov  8 17:42:07 2005
@@ -8,32 +8,30 @@
 
 #include <stdlib.h>
 #include <unistd.h>
+
 #include "xg_private.h"
-#include <xenctrl.h>
-#include <xen/memory.h>
-
-#define MAX_BATCH_SIZE 1024
-
-#define DEBUG 0
-
-#if 1
-#define ERR(_f, _a...) do { fprintf ( stderr, _f , ## _a ); fflush(stderr); } while(0)
-#else
-#define ERR(_f, _a...) ((void)0)
-#endif
-
-#if DEBUG
-#define DPRINTF(_f, _a...) do { fprintf ( stdout, _f , ## _a ); fflush(stdout); } while (0)
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-#define PROGRESS 0
-#if PROGRESS
-#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ); fflush(stderr)
-#else
-#define PPRINTF(_f, _a...)
-#endif
+#include "xg_save_restore.h"
+
+
+
+/* max mfn of the whole machine */
+static uint32_t max_mfn; 
+
+/* virtual starting address of the hypervisor */
+static uint32_t hvirt_start; 
+
+/* #levels of page tables used by the current guest */
+static uint32_t pt_levels; 
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+/* Live mapping of the table mapping each PFN to its current MFN. */
+static unsigned long *live_p2m = NULL;
+
+/* A table mapping each PFN to its new MFN. */
+static unsigned long *p2m = NULL;
+
 
 static ssize_t
 read_exact(int fd, void *buf, size_t count)
@@ -45,24 +43,93 @@
         s = read(fd, &b[r], count - r);
         if ((s == -1) && (errno == EINTR))
             continue;
-        if (s <= 0)
+        if (s <= 0) { 
             break;
+        } 
         r += s;
     }
 
-    return r;
+    return (r == count) ? 1 : 0; 
 }
 
-int xc_linux_restore(int xc_handle, int io_fd, uint32_t dom, unsigned long nr_pfns,
+
+/*
+** In the state file (or during transfer), all page-table pages are 
+** converted into a 'canonical' form where references to actual mfns 
+** are replaced with references to the corresponding pfns. 
+** This function inverts that operation, replacing the pfn values with 
+** the (now known) appropriate mfn values. 
+*/
+int uncanonicalize_pagetable(unsigned long type, void *page) 
+{ 
+    int i, pte_last, xen_start, xen_end; 
+    unsigned long pfn; 
+    uint64_t pte; 
+
+    /* 
+    ** We need to determine which entries in this page table hold
+    ** reserved hypervisor mappings. This depends on the current
+    ** page table type as well as the number of paging levels. 
+    */
+    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8); 
+    
+    if (pt_levels == 2 && type == L2TAB)
+        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT); 
+
+    if (pt_levels == 3 && type == L3TAB) 
+        xen_start = L3_PAGETABLE_ENTRIES_PAE; 
+
+
+    /* Now iterate through the page table, uncanonicalizing each PTE */
+    for(i = 0; i < pte_last; i++) { 
+        
+        if(pt_levels == 2) 
+            pte = ((uint32_t *)page)[i]; 
+        else 
+            pte = ((uint64_t *)page)[i]; 
+        
+        if(i >= xen_start && i < xen_end) 
+            pte = 0; 
+        
+        if(pte & _PAGE_PRESENT) { 
+            
+            pfn = pte >> PAGE_SHIFT; 
+            
+            if(pfn >= max_pfn) { 
+                ERR("Frame number in type %lu page table is out of range: "
+                    "i=%d pfn=0x%lx max_pfn=%lu", 
+                    type >> 28, i, pfn, max_pfn);
+                return 0; 
+            } 
+            
+            
+            if(type == L1TAB) 
+                pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PAT);
+            else 
+                pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE);
+            
+            pte |= p2m[pfn] << PAGE_SHIFT;
+            
+            if(pt_levels == 2) 
+                ((uint32_t *)page)[i] = (uint32_t)pte; 
+            else 
+                ((uint64_t *)page)[i] = (uint64_t)pte; 
+        }
+    }
+    
+    return 1; 
+}
+
+int xc_linux_restore(int xc_handle, int io_fd, 
+                     uint32_t dom, unsigned long nr_pfns, 
                      unsigned int store_evtchn, unsigned long *store_mfn,
                      unsigned int console_evtchn, unsigned long *console_mfn)
 {
     dom0_op_t op;
-    int rc = 1, i, n, k;
-    unsigned long mfn, pfn, xpfn;
+    int rc = 1, i, n;
+    unsigned long mfn, pfn; 
     unsigned int prev_pc, this_pc;
     int verify = 0;
-    int err;
 
     /* The new domain's shared-info frame number. */
     unsigned long shared_info_frame;
@@ -72,29 +139,21 @@
     /* A copy of the CPU context of the guest. */
     vcpu_guest_context_t ctxt;
 
-    /* A table containg the type of each PFN (/not/ MFN!). */
+    /* A table containing the type of each PFN (/not/ MFN!). */
     unsigned long *pfn_type = NULL;
 
     /* A table of MFNs to map in the current region */
     unsigned long *region_mfn = NULL;
 
     /* A temporary mapping, and a copy, of one frame of guest memory. */
-    unsigned long *ppage = NULL;
+    unsigned long *page = NULL;
 
     /* A copy of the pfn-to-mfn table frame list. */
-    unsigned long pfn_to_mfn_frame_list[1024];
-
-    /* A table mapping each PFN to its new MFN. */
-    unsigned long *pfn_to_mfn_table = NULL;
-
-    /* used by mapper for updating the domain's copy of the table */
-    unsigned long *live_pfn_to_mfn_table = NULL;
+    unsigned long *p2m_frame_list = NULL; 
 
     /* A temporary mapping of the guest's start_info page. */
     start_info_t *start_info;
 
-    int pt_levels = 2; /* XXX auto-detect this */
-
     char *region_base;
 
     xc_mmu_t *mmu = NULL;
@@ -102,37 +161,60 @@
     /* used by debug verify code */
     unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
 
-#define MAX_PIN_BATCH 1024
     struct mmuext_op pin[MAX_PIN_BATCH];
     unsigned int nr_pins = 0;
 
-    DPRINTF("xc_linux_restore start: nr_pfns = %lx\n", nr_pfns);
+
+    max_pfn = nr_pfns; 
+
+    DPRINTF("xc_linux_restore start: max_pfn = %lx\n", max_pfn);
+
+
+    if(!get_platform_info(xc_handle, dom, 
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERR("Unable to get platform info."); 
+        return 1;
+    }
+
 
     if (mlock(&ctxt, sizeof(ctxt))) {
-        /* needed for when we do the build dom0 op, 
-           but might as well do early */
+        /* needed for build dom0 op, but might as well do early */
         ERR("Unable to mlock ctxt");
         return 1;
     }
 
-    if (read_exact(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
-        ERR("read pfn_to_mfn_frame_list failed");
-        goto out;
-    }
-
+
+    /* Only have to worry about vcpu 0 even for SMP */
+    if (xc_domain_get_vcpu_context( xc_handle, dom, 0, &ctxt)) {
+        ERR("Could not get vcpu context");
+        goto out;
+    }
+
+    
+    /* Read the saved P2M frame list */
+    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) { 
+        ERR("Couldn't allocate p2m_frame_list array");
+        goto out;
+    }
+    
+    if (!read_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) { 
+        ERR("read p2m_frame_list failed");
+        goto out;
+    }
+
+    
     /* We want zeroed memory so use calloc rather than malloc. */
-    pfn_to_mfn_table = calloc(4, nr_pfns);
-    pfn_type = calloc(4, nr_pfns);    
-    region_mfn = calloc(4, MAX_BATCH_SIZE);
-
-    if ((pfn_to_mfn_table == NULL) || (pfn_type == NULL) || 
-        (region_mfn == NULL)) {
+    p2m        = calloc(sizeof(unsigned long), max_pfn); 
+    pfn_type   = calloc(sizeof(unsigned long), max_pfn);    
+    region_mfn = calloc(sizeof(unsigned long), MAX_BATCH_SIZE);
+
+    if ((p2m == NULL) || (pfn_type == NULL) || (region_mfn == NULL)) {
         ERR("memory alloc failed");
         errno = ENOMEM;
         goto out;
     }
     
-    if (mlock(region_mfn, 4 * MAX_BATCH_SIZE)) {
+    if (mlock(region_mfn, sizeof(unsigned long) * MAX_BATCH_SIZE)) {
         ERR("Could not mlock region_mfn");
         goto out;
     }
@@ -146,35 +228,30 @@
     }
     shared_info_frame = op.u.getdomaininfo.shared_info_frame;
 
-    err = xc_domain_setmaxmem(xc_handle, dom, nr_pfns * PAGE_SIZE / 1024);
-    if (err != 0) {
+    if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) { 
         errno = ENOMEM;
         goto out;
     }
-
-    err = xc_domain_memory_increase_reservation(xc_handle, dom,
-                                                nr_pfns, 0, 0, NULL);
-    if (err != 0) {
-        ERR("Failed to increase reservation by %lx\n", 
-            nr_pfns * PAGE_SIZE / 1024); 
+    
+    if(xc_domain_memory_increase_reservation(
+           xc_handle, dom, max_pfn, 0, 0, NULL) != 0) { 
+        ERR("Failed to increase reservation by %lx KB\n", max_pfn); 
         errno = ENOMEM;
         goto out;
     }
 
     /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
-    if (xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) !=
-        nr_pfns) {
+    if (xc_get_pfn_list(xc_handle, dom, p2m, max_pfn) != max_pfn) {
         ERR("Did not read correct number of frame numbers for new dom");
         goto out;
     }
-
-    mmu = xc_init_mmu_updates(xc_handle, dom);
-    if (mmu == NULL) {
+    
+    if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) { 
         ERR("Could not initialise for MMU updates");
         goto out;
     }
 
-    DPRINTF("Reloading memory pages:   0%%");
+    DPRINTF("Reloading memory pages:   0%%\n");
 
     /*
      * Now simply read each saved frame into its new machine frame.
@@ -183,258 +260,229 @@
     prev_pc = 0;
 
     n = 0;
-    while ( 1 )
-    {
+    while (1) { 
+
         int j;
         unsigned long region_pfn_type[MAX_BATCH_SIZE];
 
-        this_pc = (n * 100) / nr_pfns;
+        this_pc = (n * 100) / max_pfn;
         if ( (this_pc - prev_pc) >= 5 )
         {
             PPRINTF("\b\b\b\b%3d%%", this_pc);
             prev_pc = this_pc;
         }
 
-        if ( read_exact(io_fd, &j, sizeof(int)) != sizeof(int) )
-        {
+        if (!read_exact(io_fd, &j, sizeof(int))) { 
             ERR("Error when reading batch size");
             goto out;
         }
 
         PPRINTF("batch %d\n",j);
  
-        if ( j == -1 )
-        {
+        if (j == -1) {
             verify = 1;
-            printf("Entering page verify mode\n");
+            fprintf(stderr, "Entering page verify mode\n");
             continue;
         }
 
-        if ( j == 0 )
+        if (j == 0)
             break;  /* our work here is done */
 
-        if ( j > MAX_BATCH_SIZE )
-        {
+        if (j > MAX_BATCH_SIZE) { 
             ERR("Max batch size exceeded. Giving up.");
             goto out;
         }
  
-        if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) !=
-             j*sizeof(unsigned long) ) {
+        if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) { 
             ERR("Error when reading region pfn types");
+            goto out;
+        }
+
+        for (i = 0; i < j; i++) { 
+
+            if ((region_pfn_type[i] & LTAB_MASK) == XTAB)
+                region_mfn[i] = 0; /* we know map will fail, but don't care */
+            else 
+                region_mfn[i] = p2m[region_pfn_type[i] & ~LTAB_MASK]; 
+
+        }
+ 
+        if (!(region_base = xc_map_foreign_batch(
+                  xc_handle, dom, PROT_WRITE, region_mfn, j))) {  
+            ERR("map batch failed");
             goto out;
         }
 
         for ( i = 0; i < j; i++ )
         {
-            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB)
-            {
-                region_mfn[i] = 0; /* we know map will fail, but don't care */
-            }
-            else
-            {  
-                pfn = region_pfn_type[i] & ~LTAB_MASK;
-                region_mfn[i] = pfn_to_mfn_table[pfn];
-            }          
-        }
- 
-        if ( (region_base = xc_map_foreign_batch( xc_handle, dom, 
-                                                  PROT_WRITE,
-                                                  region_mfn,
-                                                  j )) == 0 )
-        {
-            ERR("map batch failed");
-            goto out;
-        }
-
-        for ( i = 0; i < j; i++ )
-        {
-            unsigned long *ppage;
-
-            pfn = region_pfn_type[i] & ~LTAB_MASK;
-
-            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB) continue;
-
-            if (pfn>nr_pfns)
-            {
+            void *page;
+            unsigned long pagetype; 
+
+            pfn      = region_pfn_type[i] & ~LTAB_MASK;
+            pagetype = region_pfn_type[i] & LTAB_MASK; 
+
+            if (pagetype == XTAB) 
+                /* a bogus/unmapped page: skip it */
+                continue;
+            
+            if (pfn > max_pfn) {
                 ERR("pfn out of range");
                 goto out;
             }
 
-            region_pfn_type[i] &= LTAB_MASK;
-
-            pfn_type[pfn] = region_pfn_type[i];
-
-            mfn = pfn_to_mfn_table[pfn];
-
-            if ( verify )
-                ppage = (unsigned long*) buf;  /* debug case */
-            else
-                ppage = (unsigned long*) (region_base + i*PAGE_SIZE);
-
-            if ( read_exact(io_fd, ppage, PAGE_SIZE) != PAGE_SIZE )
-            {
-                ERR("Error when reading pagetable page");
+            pfn_type[pfn] = pagetype; 
+
+            mfn = p2m[pfn];
+
+            /* In verify mode, we use a copy; otherwise we work in place */
+            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE); 
+
+            if (!read_exact(io_fd, page, PAGE_SIZE)) { 
+                ERR("Error when reading page (type was %lx)", pagetype);
                 goto out;
             }
 
-            switch( region_pfn_type[i] & LTABTYPE_MASK )
-            {
-            case 0:
-                break;
-
-            case L1TAB:
-            {
-                for ( k = 0; k < 1024; k++ ) 
-                {
-                    if ( ppage[k] & _PAGE_PRESENT ) 
-                    {
-                        xpfn = ppage[k] >> PAGE_SHIFT;
-                        if ( xpfn >= nr_pfns )
-                        {
-                            ERR("Frame number in type %lu page "
-                                "table is out of range. i=%d k=%d "
-                                "pfn=0x%lx nr_pfns=%lu", 
-                                region_pfn_type[i]>>28, i, 
-                                k, xpfn, nr_pfns);
-                            goto out;
-                        }
-
-                        ppage[k] &= (PAGE_SIZE - 1) & 
-                            ~(_PAGE_GLOBAL | _PAGE_PAT);
-                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
+            pagetype &= LTABTYPE_MASK; 
+
+            if(pagetype >= L1TAB && pagetype <= L4TAB) { 
+                
+                /* 
+                ** A page table page - need to 'uncanonicalize' it, i.e. 
+                ** replace all the references to pfns with the corresponding 
+                ** mfns for the new domain. 
+                */ 
+                if(!uncanonicalize_pagetable(pagetype, page))
+                    goto out; 
+
+            } else if(pagetype != NOTAB) { 
+
+                ERR("Bogus page type %lx page table is out of range: "
+                    "i=%d max_pfn=%lu", pagetype, i, max_pfn);
+                goto out;
+
+            } 
+
+
+
+            if (verify) {
+
+                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
+
+                if (res) { 
+
+                    int v;
+
+                    DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
+                            "actualcs=%08lx\n", pfn, pfn_type[pfn], 
+                            csum_page(region_base + i*PAGE_SIZE), 
+                            csum_page(buf));
+
+                    for (v = 0; v < 4; v++) {
+                        
+                        unsigned long *p = (unsigned long *) 
+                            (region_base + i*PAGE_SIZE);
+                        if (buf[v] != p[v])
+                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
                     }
                 }
             }
-            break;
-
-            case L2TAB:
-            {
-                for ( k = 0; 
-                      k < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT); 
-                      k++ )
-                {
-                    if ( ppage[k] & _PAGE_PRESENT )
-                    {
-                        xpfn = ppage[k] >> PAGE_SHIFT;
-
-                        if ( xpfn >= nr_pfns )
-                        {
-                            ERR("Frame number in type %lu page"
-                                " table is out of range. i=%d k=%d "
-                                "pfn=%lu nr_pfns=%lu",
-                                region_pfn_type[i]>>28, i, k, 
-                                xpfn, nr_pfns);
-                            goto out;
-                        }
-
-                        ppage[k] &= (PAGE_SIZE - 1) & 
-                            ~(_PAGE_GLOBAL | _PAGE_PSE);
-                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
-                    }
-                }
-            }
-            break;
-
-            default:
-                ERR("Bogus page type %lx page table is "
-                    "out of range. i=%d nr_pfns=%lu", 
-                    region_pfn_type[i], i, nr_pfns);
-                goto out;
-
-            } /* end of page type switch statement */
-
-            if ( verify )
-            {
-                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE );
-                if ( res )
-                {
-                    int v;
-                    printf("************** pfn=%lx type=%lx gotcs=%08lx "
-                           "actualcs=%08lx\n", pfn, pfn_type[pfn], 
-                           csum_page(region_base + i*PAGE_SIZE), 
-                           csum_page(buf));
-                    for ( v = 0; v < 4; v++ )
-                    {
-                        unsigned long *p = (unsigned long *)
-                            (region_base + i*PAGE_SIZE);
-                        if ( buf[v] != p[v] )
-                            printf("    %d: %08lx %08lx\n",
-                                   v, buf[v], p[v] );
-                    }
-                }
-            }
-
-            if ( xc_add_mmu_update(xc_handle, mmu,
-                                   (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
-                                   pfn) )
-            {
-                printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn);
+
+            if (xc_add_mmu_update(xc_handle, mmu, 
+                                  (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
+                                  pfn)) {
+                ERR("machpys mfn=%ld pfn=%ld", mfn, pfn);
                 goto out;
             }
-
         } /* end of 'batch' for loop */
 
-        munmap( region_base, j*PAGE_SIZE );
-        n+=j; /* crude stats */
+        munmap(region_base, j*PAGE_SIZE);
+        n+= j; /* crude stats */
     }
 
     DPRINTF("Received all pages\n");
 
-    if ( pt_levels == 3 )
-    {
+    if (pt_levels == 3) {
+
         /* Get all PGDs below 4GB. */
-        for ( i = 0; i < nr_pfns; i++ )
-        {
-            if ( ((pfn_type[i] & LTABTYPE_MASK) == L3TAB) &&
-                 (pfn_to_mfn_table[i] > 0xfffffUL) )
-            {
-                unsigned long new_mfn = xc_make_page_below_4G(
-                    xc_handle, dom, pfn_to_mfn_table[i]);
-                if ( new_mfn == 0 )
-                {
-                    fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
+        for (i = 0; i < max_pfn; i++) {
+            
+            if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
+
+                unsigned long new_mfn; 
+
+                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
+                    ERR("Couldn't get a page below 4GB :-(");
                     goto out;
                 }
-                pfn_to_mfn_table[i] = new_mfn;
-                if ( xc_add_mmu_update(
-                    xc_handle, mmu, (new_mfn << PAGE_SHIFT) |
-                    MMU_MACHPHYS_UPDATE, i) )
-                {
-                    fprintf(stderr, "Couldn't m2p on PAE root pgdir\n");
+                
+                p2m[i] = new_mfn;
+                if (xc_add_mmu_update(
+                        xc_handle, mmu, 
+                        (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, i)) {
+                    ERR("Couldn't m2p on PAE root pgdir");
                     goto out;
                 }
             }
         }
-    }
-
-    if ( xc_finish_mmu_updates(xc_handle, mmu) )
-        goto out;
+        
+    }
+
+
+    if (xc_finish_mmu_updates(xc_handle, mmu)) { 
+        ERR("Error doing finish_mmu_updates()"); 
+        goto out;
+    } 
 
     /*
      * Pin page tables. Do this after writing to them as otherwise Xen
      * will barf when doing the type-checking.
      */
-    for ( i = 0; i < nr_pfns; i++ )
-    {
+    for (i = 0; i < max_pfn; i++) {
+
         if ( (pfn_type[i] & LPINTAB) == 0 )
             continue;
-        if ( pfn_type[i] == (L1TAB|LPINTAB) )
+        
+        switch(pfn_type[i]) { 
+
+        case (L1TAB|LPINTAB): 
             pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
-        else /* pfn_type[i] == (L2TAB|LPINTAB) */
+            break; 
+            
+        case (L2TAB|LPINTAB): 
             pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
-        pin[nr_pins].arg1.mfn = pfn_to_mfn_table[i];
-        if ( ++nr_pins == MAX_PIN_BATCH )
-        {
-            if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
+            break; 
+            
+        case (L3TAB|LPINTAB): 
+            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
+            break; 
+
+        case (L4TAB|LPINTAB):
+            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
+            break; 
+            
+        default: 
+            continue; 
+        }
+
+        pin[nr_pins].arg1.mfn = p2m[i];
+        
+        if (++nr_pins == MAX_PIN_BATCH) {
+            if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) { 
+                ERR("Failed to pin batch of %d page tables", nr_pins); 
                 goto out;
+            } 
+            DPRINTF("successfully pinned batch of %d page tables", nr_pins); 
             nr_pins = 0;
         }
     }
-
-    if ( (nr_pins != 0) &&
-         (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
-        goto out;
+    
+    if (nr_pins != 0) { 
+        if((rc = xc_mmuext_op(xc_handle, pin, nr_pins, dom)) < 0) { 
+            ERR("Failed (2) to pin batch of %d page tables", nr_pins); 
+            DPRINTF("rc is %d\n", rc); 
+            goto out;
+        }
+    }
 
     DPRINTF("\b\b\b\b100%%\n");
     DPRINTF("Memory reloaded.\n");
@@ -445,111 +493,115 @@
         unsigned long *pfntab;
         int rc;
 
-        if ( read_exact(io_fd, &count, sizeof(count)) != sizeof(count) )
-        {
+        if (!read_exact(io_fd, &count, sizeof(count))) { 
             ERR("Error when reading pfn count");
             goto out;
         }
 
-        pfntab = malloc( sizeof(unsigned int) * count );
-        if ( pfntab == NULL )
-        {
+        if(!(pfntab = malloc(sizeof(unsigned long) * count))) { 
             ERR("Out of memory");
             goto out;
         }
-
-        if ( read_exact(io_fd, pfntab, sizeof(unsigned int)*count) !=
-             sizeof(unsigned int)*count )
-        {
+        
+        if (!read_exact(io_fd, pfntab, sizeof(unsigned long)*count)) { 
             ERR("Error when reading pfntab");
             goto out;
         }
 
-        for ( i = 0; i < count; i++ )
-        {
+        for (i = 0; i < count; i++) {
+
             unsigned long pfn = pfntab[i];
-            pfntab[i]=pfn_to_mfn_table[pfn];
-            pfn_to_mfn_table[pfn] = 0x80000001;  // not in pmap
-        }
-
-        if ( count > 0 )
-        {
+
+            if(pfn > max_pfn) 
+                /* shouldn't happen - continue optimistically */
+                continue; 
+
+            pfntab[i] = p2m[pfn];
+            p2m[pfn]  = 0x80000001;  // not in pmap
+        }
+        
+        if (count > 0) {
+
             struct xen_memory_reservation reservation = {
                 .extent_start = pfntab,
                 .nr_extents   = count,
                 .extent_order = 0,
                 .domid        = dom
             };
-            if ( (rc = xc_memory_op(xc_handle,
-                                    XENMEM_decrease_reservation,
-                                    &reservation)) != count )
-            {
-                ERR("Could not decrease reservation : %d",rc);
+
+            if ((rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
+                                   &reservation)) != count) { 
+                ERR("Could not decrease reservation : %d", rc);
                 goto out;
-            }
-            else
-            {
-                printf("Decreased reservation by %d pages\n", count);
-            }
+            } else
+                DPRINTF("Decreased reservation by %d pages\n", count);
         } 
     }
 
-    if ( read_exact(io_fd, &ctxt,            sizeof(ctxt)) != sizeof(ctxt) ||
-         read_exact(io_fd, shared_info_page, PAGE_SIZE) != PAGE_SIZE )
-    {
+    if (!read_exact(io_fd, &ctxt, sizeof(ctxt)) || 
+        !read_exact(io_fd, shared_info_page, PAGE_SIZE)) { 
         ERR("Error when reading ctxt or shared info page");
         goto out;
     }
 
     /* Uncanonicalise the suspend-record frame number and poke resume rec. */
     pfn = ctxt.user_regs.edx;
-    if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
-    {
+    if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
         ERR("Suspend record frame number is bad");
         goto out;
     }
-    ctxt.user_regs.edx = mfn = pfn_to_mfn_table[pfn];
+    ctxt.user_regs.edx = mfn = p2m[pfn];
     start_info = xc_map_foreign_range(
         xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
-    start_info->nr_pages    = nr_pfns;
+    start_info->nr_pages    = max_pfn;
     start_info->shared_info = shared_info_frame << PAGE_SHIFT;
     start_info->flags       = 0;
-    *store_mfn = start_info->store_mfn   =
-        pfn_to_mfn_table[start_info->store_mfn];
-    start_info->store_evtchn = store_evtchn;
-    *console_mfn = start_info->console_mfn   =
-        pfn_to_mfn_table[start_info->console_mfn];
-    start_info->console_evtchn = console_evtchn;
+    *store_mfn = start_info->store_mfn       = p2m[start_info->store_mfn];
+    start_info->store_evtchn                 = store_evtchn;
+    *console_mfn = start_info->console_mfn   = p2m[start_info->console_mfn];
+    start_info->console_evtchn               = console_evtchn;
     munmap(start_info, PAGE_SIZE);
 
     /* Uncanonicalise each GDT frame number. */
-    if ( ctxt.gdt_ents > 8192 )
-    {
+    if (ctxt.gdt_ents > 8192) {
         ERR("GDT entry count out of range");
         goto out;
     }
 
-    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
-    {
+    for (i = 0; i < ctxt.gdt_ents; i += 512) {
         pfn = ctxt.gdt_frames[i];
-        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
-        {
+        if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
             ERR("GDT frame number is bad");
             goto out;
         }
-        ctxt.gdt_frames[i] = pfn_to_mfn_table[pfn];
+        ctxt.gdt_frames[i] = p2m[pfn];
     }
 
     /* Uncanonicalise the page table base pointer. */
     pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
-    if ( (pfn >= nr_pfns) || ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB) )
-    {
-        printf("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
-               pfn, nr_pfns, pfn_type[pfn], (unsigned long)L2TAB);
+
+    if (pfn >= max_pfn) {
+        DPRINTF("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx\n",
+                pfn, max_pfn, pfn_type[pfn]); 
         ERR("PT base is bad.");
         goto out;
     }
-    ctxt.ctrlreg[3] = pfn_to_mfn_table[pfn] << PAGE_SHIFT;
+
+    if ((pt_levels == 2) && ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB)) { 
+        DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
+                pfn, max_pfn, pfn_type[pfn], (unsigned long)L2TAB);
+        ERR("PT base is bad.");
+        goto out;
+    }
+
+    if ((pt_levels == 3) && ((pfn_type[pfn]&LTABTYPE_MASK) != L3TAB)) { 
+        DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
+                pfn, max_pfn, pfn_type[pfn], (unsigned long)L3TAB);
+        ERR("PT base is bad.");
+        goto out;
+    }
+    
+    ctxt.ctrlreg[3] = p2m[pfn] << PAGE_SHIFT;
 
     /* clear any pending events and the selector */
     memset(&(shared_info->evtchn_pending[0]), 0,
@@ -558,40 +610,31 @@
         shared_info->vcpu_data[i].evtchn_pending_sel = 0;
 
     /* Copy saved contents of shared-info page. No checking needed. */
-    ppage = xc_map_foreign_range(
+    page = xc_map_foreign_range(
         xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
-    memcpy(ppage, shared_info, sizeof(shared_info_t));
-    munmap(ppage, PAGE_SIZE);
-
+    memcpy(page, shared_info, sizeof(shared_info_t));
+    munmap(page, PAGE_SIZE);
+    
     /* Uncanonicalise the pfn-to-mfn table frame-number list. */
-    for ( i = 0; i < (nr_pfns+1023)/1024; i++ )
-    {
-        unsigned long pfn, mfn;
-
-        pfn = pfn_to_mfn_frame_list[i];
-        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
-        {
+    for (i = 0; i < P2M_FL_ENTRIES; i++) {
+        pfn = p2m_frame_list[i];
+        if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
             ERR("PFN-to-MFN frame number is bad");
             goto out;
         }
-        mfn = pfn_to_mfn_table[pfn];
-        pfn_to_mfn_frame_list[i] = mfn;
-    }
-    
-    if ( (live_pfn_to_mfn_table = 
-          xc_map_foreign_batch(xc_handle, dom, 
-                               PROT_WRITE,
-                               pfn_to_mfn_frame_list,
-                               (nr_pfns+1023)/1024 )) == 0 )
-    {
-        ERR("Couldn't map pfn_to_mfn table");
-        goto out;
-    }
-
-    memcpy(live_pfn_to_mfn_table, pfn_to_mfn_table, 
-           nr_pfns*sizeof(unsigned long) );
-
-    munmap(live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE);
+
+        p2m_frame_list[i] = p2m[pfn];
+    }
+    
+    /* Copy the P2M we've constructed to the 'live' P2M */
+    if (!(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE, 
+                                          p2m_frame_list, P2M_FL_ENTRIES))) {
+        ERR("Couldn't map p2m table");
+        goto out;
+    }
+
+    memcpy(live_p2m, p2m, P2M_SIZE); 
+    munmap(live_p2m, P2M_SIZE); 
 
     /*
      * Safety checking of saved context:
@@ -605,25 +648,23 @@
      *  8. debugregs are checked by Xen.
      *  9. callback code selectors need checking.
      */
-    for ( i = 0; i < 256; i++ )
-    {
+    for ( i = 0; i < 256; i++ ) {
         ctxt.trap_ctxt[i].vector = i;
-        if ( (ctxt.trap_ctxt[i].cs & 3) == 0 )
+        if ((ctxt.trap_ctxt[i].cs & 3) == 0)
             ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
     }
-    if ( (ctxt.kernel_ss & 3) == 0 )
+    if ((ctxt.kernel_ss & 3) == 0)
         ctxt.kernel_ss = FLAT_KERNEL_DS;
 #if defined(__i386__)
-    if ( (ctxt.event_callback_cs & 3) == 0 )
+    if ((ctxt.event_callback_cs & 3) == 0)
         ctxt.event_callback_cs = FLAT_KERNEL_CS;
-    if ( (ctxt.failsafe_callback_cs & 3) == 0 )
+    if ((ctxt.failsafe_callback_cs & 3) == 0)
         ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
 #endif
-    if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
-         (ctxt.ldt_ents > 8192) ||
-         (ctxt.ldt_base > HYPERVISOR_VIRT_START) ||
-         ((ctxt.ldt_base + ctxt.ldt_ents*8) > HYPERVISOR_VIRT_START) )
-    {
+    if (((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
+        (ctxt.ldt_ents > 8192) ||
+        (ctxt.ldt_base > hvirt_start) ||
+        ((ctxt.ldt_base + ctxt.ldt_ents*8) > hvirt_start)) {
         ERR("Bad LDT base or size");
         goto out;
     }
@@ -636,8 +677,7 @@
     op.u.setdomaininfo.ctxt   = &ctxt;
     rc = xc_dom0_op(xc_handle, &op);
 
-    if ( rc != 0 )
-    {
+    if (rc != 0) {
         ERR("Couldn't build the domain");
         goto out;
     }
@@ -646,9 +686,10 @@
     if ( (rc != 0) && (dom != 0) )
         xc_domain_destroy(xc_handle, dom);
     free(mmu);
-    free(pfn_to_mfn_table);
+    free(p2m);
     free(pfn_type);
 
     DPRINTF("Restore exit with rc=%d\n", rc);
+
     return rc;
 }
diff -r abbe3df33774 -r b3c2bc39d815 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c       Tue Nov  8 17:39:58 2005
+++ b/tools/libxc/xc_linux_save.c       Tue Nov  8 17:42:07 2005
@@ -13,10 +13,7 @@
 #include <sys/time.h>
 
 #include "xg_private.h"
-
-#define BATCH_SIZE 1024   /* 1024 pages (4MB) at a time */
-
-#define MAX_MBIT_RATE 500
+#include "xg_save_restore.h"
 
 /*
 ** Default values for important tuning parameters. Can override by passing
@@ -25,75 +22,77 @@
 ** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too. 
 ** 
 */
-#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop */ 
-#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns */
-
-/* Flags to control behaviour of xc_linux_save */
-#define XCFLAGS_LIVE      1
-#define XCFLAGS_DEBUG     2
-
-#define DEBUG 0
-
-#if 1
-#define ERR(_f, _a...) do { fprintf(stderr, _f "\n" , ## _a); fflush(stderr); } while (0)
-#else
-#define ERR(_f, _a...) ((void)0)
-#endif
-
-#if DEBUG
-#define DPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-#define PROGRESS 0
-#if PROGRESS
-#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
-#else
-#define PPRINTF(_f, _a...)
-#endif
+#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */ 
+#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
+
+
+/* max mfn of the whole machine */
+static uint32_t max_mfn; 
+
+/* virtual starting address of the hypervisor */
+static uint32_t hvirt_start; 
+
+/* #levels of page tables used by the current guest */
+static uint32_t pt_levels; 
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+/* Live mapping of the table mapping each PFN to its current MFN. */
+static unsigned long *live_p2m = NULL;
+
+/* Live mapping of system MFN to PFN table. */
+static unsigned long *live_m2p = NULL;
+
 
 /*
  * Returns TRUE if the given machine frame number has a unique mapping
  * in the guest's pseudophysical map.
  */
-
-#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                    \
-    (((_mfn) < (1024*1024)) &&                                            \
-     ((live_mfn_to_pfn_table[_mfn] < nr_pfns) &&                         \
-       (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))))
-
+#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
+(((_mfn) < (max_mfn)) &&                        \
+ ((live_m2p[_mfn] < (max_pfn)) &&               \
+  (live_p2m[live_m2p[_mfn]] == (_mfn))))
+    
  
 /* Returns TRUE if MFN is successfully converted to a PFN. */
-#define translate_mfn_to_pfn(_pmfn)            \
-({                                             \
-    unsigned long mfn = *(_pmfn);              \
-    int _res = 1;                              \
-    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )      \
-        _res = 0;                              \
-    else                                       \
-        *(_pmfn) = live_mfn_to_pfn_table[mfn]; \
-    _res;                                      \
+#define translate_mfn_to_pfn(_pmfn)                             \
+({                                                              \
+    unsigned long mfn = *(_pmfn);                               \
+    int _res = 1;                                               \
+    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )                       \
+        _res = 0;                                               \
+    else                                                        \
+        *(_pmfn) = live_m2p[mfn];                               \
+    _res;                                                       \
 })
 
-#define is_mapped(pfn) (!((pfn) & 0x80000000UL))
-
-static inline int test_bit ( int nr, volatile void * addr)
-{
-    return (((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] >> 
-            (nr % (sizeof(unsigned long)*8))) & 1;
-}
-
-static inline void clear_bit ( int nr, volatile void * addr)
-{
-    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] &= 
-        ~(1 << (nr % (sizeof(unsigned long)*8) ) );
+/* 
+** During (live) save/migrate, we maintain a number of bitmaps to track 
+** which pages we have to send, to fixup, and to skip. 
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8) 
+#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+static inline int test_bit (int nr, volatile void * addr)
+{
+    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1; 
+}
+
+static inline void clear_bit (int nr, volatile void * addr)
+{
+    BITMAP_ENTRY(nr, addr) &= ~(1 << BITMAP_SHIFT(nr)); 
 }
 
 static inline void set_bit ( int nr, volatile void * addr)
 {
-    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] |= 
-        (1 << (nr % (sizeof(unsigned long)*8) ) );
+    BITMAP_ENTRY(nr, addr) |= (1 << BITMAP_SHIFT(nr)); 
 }
 
 /* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
@@ -142,102 +141,106 @@
 
     do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
     while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
-
+    
     return i;
 }
 
-static long long tv_to_us( struct timeval *new )
+
+
+
+static uint64_t tv_to_us(struct timeval *new)
 {
     return (new->tv_sec * 1000000) + new->tv_usec;
 }
 
-static long long llgettimeofday( void )
+static uint64_t llgettimeofday(void)
 {
     struct timeval now;
     gettimeofday(&now, NULL);
     return tv_to_us(&now);
 }
 
-static long long tv_delta( struct timeval *new, struct timeval *old )
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
 {
     return ((new->tv_sec - old->tv_sec)*1000000 ) + 
         (new->tv_usec - old->tv_usec);
 }
 
 
-#define START_MBIT_RATE 0 //ioctxt->resource
-
+#ifdef ADAPTIVE_SAVE
+
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate). 
+*/
+
+#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
+#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
+
+
+/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
+#define RATE_TO_BTU      781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+
+/* We keep track of the current and previous transmission rate */
 static int mbit_rate, ombit_rate = 0;
-static int burst_time_us = -1;
-
-#define MBIT_RATE mbit_rate
-#define BURST_BUDGET (100*1024)
-
-/* 
-   1000000/((100)*1024*1024/8/(100*1024))
-   7812
-   1000000/((100)*1024/8/(100))
-   7812
-   1000000/((100)*128/(100))
-   7812
-   100000000/((100)*128)
-   7812
-   100000000/128
-   781250
- */
-#define RATE_TO_BTU 781250
-#define BURST_TIME_US burst_time_us
-
-static int
-ratewrite(int io_fd, void *buf, int n)
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE) 
+
+
+static inline void initialize_mbit_rate() 
+{
+    mbit_rate = START_MBIT_RATE;
+}
+
+
+static int ratewrite(int io_fd, void *buf, int n)
 {
     static int budget = 0;
+    static int burst_time_us = -1;
     static struct timeval last_put = { 0 };
     struct timeval now;
     struct timespec delay;
     long long delta;
 
-    if ( START_MBIT_RATE == 0 )
+    if (START_MBIT_RATE == 0)
         return write(io_fd, buf, n);
     
     budget -= n;
-    if ( budget < 0 )
-    {
-        if ( MBIT_RATE != ombit_rate )
-        {
-            BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
-            ombit_rate = MBIT_RATE;
+    if (budget < 0) {
+        if (mbit_rate != ombit_rate) {
+            burst_time_us = RATE_TO_BTU / mbit_rate;
+            ombit_rate = mbit_rate;
             DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
-                    MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
-        }
-        if ( last_put.tv_sec == 0 )
-        {
+                    mbit_rate, BURST_BUDGET, burst_time_us);
+        }
+        if (last_put.tv_sec == 0) {
             budget += BURST_BUDGET;
             gettimeofday(&last_put, NULL);
-        }
-        else
-        {
-            while ( budget < 0 )
-            {
+        } else {
+            while (budget < 0) {
                 gettimeofday(&now, NULL);
                 delta = tv_delta(&now, &last_put);
-                while ( delta > BURST_TIME_US )
-                {
+                while (delta > burst_time_us) {
                     budget += BURST_BUDGET;
-                    last_put.tv_usec += BURST_TIME_US;
-                    if ( last_put.tv_usec > 1000000 )
-                    {
+                    last_put.tv_usec += burst_time_us;
+                    if (last_put.tv_usec > 1000000) {
                         last_put.tv_usec -= 1000000;
                         last_put.tv_sec++;
                     }
-                    delta -= BURST_TIME_US;
+                    delta -= burst_time_us;
                 }
-                if ( budget > 0 )
+                if (budget > 0)
                     break;
                 delay.tv_sec = 0;
-                delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
-                while ( delay.tv_nsec > 0 )
-                    if ( nanosleep(&delay, &delay) == 0 )
+                delay.tv_nsec = 1000 * (burst_time_us - delta);
+                while (delay.tv_nsec > 0)
+                    if (nanosleep(&delay, &delay) == 0)
                         break;
             }
         }
@@ -245,35 +248,52 @@
     return write(io_fd, buf, n);
 }
 
-static int print_stats( int xc_handle, uint32_t domid, 
-                        int pages_sent, xc_shadow_control_stats_t *stats,
-                        int print )
+#else /* ! ADAPTIVE_SAVE */
+
+#define RATE_IS_MAX() (0) 
+#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n)) 
+#define initialize_mbit_rate() 
+
+#endif
+
+
+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+    if(write(fd, buf, count) != count) 
+        return 0; 
+    return 1; 
+} 
+
+
+
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent, 
+                       xc_shadow_control_stats_t *stats, int print)
 {
     static struct timeval wall_last;
     static long long      d0_cpu_last;
     static long long      d1_cpu_last;
-
+    
     struct timeval        wall_now;
     long long             wall_delta;
     long long             d0_cpu_now, d0_cpu_delta;
     long long             d1_cpu_now, d1_cpu_delta;
-
+    
     gettimeofday(&wall_now, NULL);
-
+    
     d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
     d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
 
     if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) 
         fprintf(stderr, "ARRHHH!!\n");
-
+    
     wall_delta = tv_delta(&wall_now,&wall_last)/1000;
-
-    if ( wall_delta == 0 ) wall_delta = 1;
-
-    d0_cpu_delta  = (d0_cpu_now - d0_cpu_last)/1000;
-    d1_cpu_delta  = (d1_cpu_now - d1_cpu_last)/1000;
-
-    if ( print )
+    
+    if (wall_delta == 0) wall_delta = 1;
+    
+    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+    if (print)
         fprintf(stderr,
                 "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
                 "dirtied %dMb/s %" PRId32 " pages\n",
@@ -284,23 +304,25 @@
                 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
                 stats->dirty_count);
 
-    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
-    {
+#ifdef ADAPTIVE_SAVE    
+    if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
         mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
             + 50;
         if (mbit_rate > MAX_MBIT_RATE)
             mbit_rate = MAX_MBIT_RATE;
     }
-
-    d0_cpu_last  = d0_cpu_now;
-    d1_cpu_last  = d1_cpu_now;
-    wall_last = wall_now; 
+#endif
+    
+    d0_cpu_last = d0_cpu_now;
+    d1_cpu_last = d1_cpu_now;
+    wall_last   = wall_now; 
 
     return 0;
 }
 
-static int analysis_phase( int xc_handle, uint32_t domid, 
-                           int nr_pfns, unsigned long *arr, int runs )
+
+static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn, 
+                          unsigned long *arr, int runs)
 {
     long long start, now;
     xc_shadow_control_stats_t stats;
@@ -308,22 +330,18 @@
 
     start = llgettimeofday();
 
-    for ( j = 0; j < runs; j++ )
-    {
+    for (j = 0; j < runs; j++) {
         int i;
-
-        xc_shadow_control( xc_handle, domid, 
-                           DOM0_SHADOW_CONTROL_OP_CLEAN,
-                           arr, nr_pfns, NULL);
+        
+        xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
+                          arr, max_pfn, NULL);
         fprintf(stderr, "#Flush\n");
-        for ( i = 0; i < 40; i++ )
-        {     
+        for ( i = 0; i < 40; i++ ) {     
             usleep(50000);     
             now = llgettimeofday();
-            xc_shadow_control( xc_handle, domid, 
-                               DOM0_SHADOW_CONTROL_OP_PEEK,
-                               NULL, 0, &stats);
-
+            xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
+                              NULL, 0, &stats);
+            
             fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
                     " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n", 
                     ((now-start)+500)/1000, 
@@ -331,7 +349,7 @@
                     stats.dirty_net_count, stats.dirty_block_count);
         }
     }
-
+    
     return -1;
 }
 
@@ -345,67 +363,150 @@
 
     printf("suspend\n");
     fflush(stdout);
-    if ( fgets(ans, sizeof(ans), stdin) == NULL )
-    {
+    if (fgets(ans, sizeof(ans), stdin) == NULL) {
         ERR("failed reading suspend reply");
         return -1;
     }
-    if ( strncmp(ans, "done\n", 5) )
-    {
+    if (strncmp(ans, "done\n", 5)) {
         ERR("suspend reply incorrect: %s", ans);
         return -1;
     }
 
  retry:
 
-    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1)
-    {
+    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
         ERR("Could not get domain info");
         return -1;
     }
 
-    if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, 
-                                    ctxt) )
-    {
+    if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, ctxt)) 
         ERR("Could not get vcpu context");
-    }
-
-    if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend )
-    {
+
+
+    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
         return 0; // success
-    }
-
-    if ( info->paused )
-    {
+
+    if (info->paused) {
         // try unpausing domain, wait, and retest 
         xc_domain_unpause( xc_handle, dom );
-
+        
         ERR("Domain was paused. Wait and re-test.");
         usleep(10000);  // 10ms
-
+        
         goto retry;
     }
 
 
-    if( ++i < 100 )
-    {
+    if( ++i < 100 ) {
         ERR("Retry suspend domain.");
         usleep(10000);  // 10ms 
         goto retry;
     }
-
+    
     ERR("Unable to suspend domain.");
 
     return -1;
 }
+
+
+/*
+** During transfer (or in the state file), all page-table pages must be  
+** converted into a 'canonical' form where references to actual mfns 
+** are replaced with references to the corresponding pfns. 
+**
+** This function performs the appropriate conversion, taking into account 
+** which entries do not require canonicalization (in particular, those 
+** entries which map the virtual address reserved for the hypervisor). 
+*/
+void canonicalize_pagetable(unsigned long type, unsigned long pfn, 
+                             const void *spage, void *dpage) 
+{ 
+    
+    int i, pte_last, xen_start, xen_end;
+    uint64_t pte;
+
+    /* 
+    ** We need to determine which entries in this page table hold
+    ** reserved hypervisor mappings. This depends on the current
+    ** page table type as well as the number of paging levels. 
+    */
+    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8); 
+    
+    if (pt_levels == 2 && type == L2TAB)
+        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT); 
+
+    if (pt_levels == 3 && type == L3TAB) 
+        xen_start = L3_PAGETABLE_ENTRIES_PAE; 
+        
+    /* 
+    ** in PAE only the L2 mapping the top 1GB contains Xen mappings. 
+    ** We can spot this by looking for the guest linear mapping which
+    ** Xen always ensures is present in that L2. Guests must ensure 
+    ** that this check will fail for other L2s. 
+    */
+    if (pt_levels == 3 && type == L2TAB) {
+
+/* XXX index of the L2 entry in PAE mode which holds the guest LPT */
+#define PAE_GLPT_L2ENTRY (495) 
+        pte = ((uint64_t*)spage)[PAE_GLPT_L2ENTRY]; 
+
+        if(((pte >> PAGE_SHIFT) & 0x0fffffff) == live_p2m[pfn])
+            xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; 
+    }
+
+    /* Now iterate through the page table, canonicalizing each PTE */
+    for (i = 0; i < pte_last; i++ ) {
+
+        unsigned long pfn, mfn; 
+        
+        if (pt_levels == 2)
+            pte = ((uint32_t*)spage)[i];
+        else
+            pte = ((uint64_t*)spage)[i];
+        
+        if (i >= xen_start && i < xen_end)
+            pte = 0;
+        
+        if (pte & _PAGE_PRESENT) {
+            
+            mfn = (pte >> PAGE_SHIFT) & 0xfffffff;      
+            pfn = live_m2p[mfn];
+            
+            if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
+                /* I don't think this should ever happen */
+                DPRINTF("FNI: [%08lx,%d] pte=%llx,"
+                        " mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
+                        type, i, (uint64_t)pte, mfn, 
+                        live_m2p[mfn],
+                        (live_m2p[mfn] < max_pfn) ? 
+                        live_p2m[live_m2p[mfn]] : 0xdeadbeaf);
+                
+                pfn = 0; /* be suspicious */
+            }
+            
+            pte &= 0xffffff0000000fffULL;
+            pte |= (uint64_t)pfn << PAGE_SHIFT;
+        }
+        
+        if (pt_levels == 2)
+            ((uint32_t*)dpage)[i] = pte;
+        else
+            ((uint64_t*)dpage)[i] = pte;                      
+        
+    } 
+    
+    return; 
+}
+
+
+
 
 int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, 
                   uint32_t max_factor, uint32_t flags)
 {
     xc_dominfo_t info;
 
-    int rc = 1, i, j, k, last_iter, iter = 0;
-    unsigned long mfn;
+    int rc = 1, i, j, last_iter, iter = 0;
     int live  = (flags & XCFLAGS_LIVE); 
     int debug = (flags & XCFLAGS_DEBUG); 
     int sent_last_iter, skip_this_iter;
@@ -421,18 +522,16 @@
     unsigned long *pfn_batch = NULL;
 
     /* A temporary mapping, and a copy, of one frame of guest memory. */
-    unsigned long page[1024];
+    char page[PAGE_SIZE]; 
+
+    /* Double and single indirect references to the live P2M table */
+    unsigned long *live_p2m_frame_list_list = NULL;
+    unsigned long *live_p2m_frame_list = NULL;
 
     /* A copy of the pfn-to-mfn table frame list. */
-    unsigned long *live_pfn_to_mfn_frame_list_list = NULL;
-    unsigned long *live_pfn_to_mfn_frame_list = NULL;
-    unsigned long pfn_to_mfn_frame_list[1024];
-
-    /* Live mapping of the table mapping each PFN to its current MFN. */
-    unsigned long *live_pfn_to_mfn_table = NULL;
-    /* Live mapping of system MFN to PFN table. */
-    unsigned long *live_mfn_to_pfn_table = NULL;
-    unsigned long mfn_to_pfn_table_start_mfn;
+    unsigned long *p2m_frame_list = NULL;
+
+    unsigned long m2p_start_mfn;
     
     /* Live mapping of shared info structure */
     shared_info_t *live_shinfo = NULL;
@@ -440,10 +539,9 @@
     /* base of the region in which domain memory is mapped */
     unsigned char *region_base = NULL;
 
-    /* number of pages we're dealing with */
-    unsigned long nr_pfns;
-
-    /* power of 2 order of nr_pfns */
+
+    
+    /* power of 2 order of max_pfn */
     int order_nr; 
 
     /* bitmap of pages:
@@ -454,207 +552,197 @@
     
     xc_shadow_control_stats_t stats;
 
-    int needed_to_fix = 0;
-    int total_sent    = 0;
-
-    MBIT_RATE = START_MBIT_RATE;
+    unsigned long needed_to_fix = 0;
+    unsigned long total_sent    = 0;
 
 
     /* If no explicit control parameters given, use defaults */
-    if( !max_iters ) 
+    if(!max_iters) 
         max_iters = DEF_MAX_ITERS; 
-    if( !max_factor ) 
+    if(!max_factor) 
         max_factor = DEF_MAX_FACTOR; 
-
-
-    DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live?"true":"false"); 
-
-    if ( mlock(&ctxt, sizeof(ctxt)) ) 
-    {
+    
+    initialize_mbit_rate(); 
+
+    DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live ? 
+            "true" : "false"); 
+
+    if(!get_platform_info(xc_handle, dom, 
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERR("Unable to get platform info."); 
+        return 1;
+    }
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERR("Could not get domain info");
+        return 1; 
+    }
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
         ERR("Unable to mlock ctxt");
         return 1;
     }
     
-    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
-    {
-        ERR("Could not get domain info");
-        goto out;
-    }
-    if ( xc_domain_get_vcpu_context(xc_handle, dom, /* FIXME */ 0, &ctxt) )
-    {
+    /* Only have to worry about vcpu 0 even for SMP */
+    if (xc_domain_get_vcpu_context(xc_handle, dom, 0, &ctxt)) {
         ERR("Could not get vcpu context");
         goto out;
     }
     shared_info_frame = info.shared_info_frame;
 
     /* A cheesy test to see whether the domain contains valid state. */
-    if ( ctxt.ctrlreg[3] == 0 )
+    if (ctxt.ctrlreg[3] == 0)
     {
         ERR("Domain is not in a valid Linux guest OS state");
         goto out;
     }
-    
-    nr_pfns = info.max_memkb >> (PAGE_SHIFT - 10);
-
-    /* cheesy sanity check */
-    if ( nr_pfns > 1024*1024 )
-    {
-        ERR("Invalid state record -- pfn count out of range: %lu", nr_pfns);
-        goto out;
-    }
-
+  
+   /* cheesy sanity check */
+    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
+        ERR("Invalid state record -- pfn count out of range: %lu", 
+            (info.max_memkb >> (PAGE_SHIFT - 10))); 
+        goto out;
+     }
+ 
     /* Map the shared info frame */
-    live_shinfo = xc_map_foreign_range(
-        xc_handle, dom, PAGE_SIZE, PROT_READ, shared_info_frame);
-    if ( !live_shinfo )
-    {
+    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
+                                            PROT_READ, shared_info_frame))) {
         ERR("Couldn't map live_shinfo");
         goto out;
     }
 
-    live_pfn_to_mfn_frame_list_list = xc_map_foreign_range(
-        xc_handle, dom,
-        PAGE_SIZE, PROT_READ, live_shinfo->arch.pfn_to_mfn_frame_list_list);
-
-    if (!live_pfn_to_mfn_frame_list_list){
-        ERR("Couldn't map pfn_to_mfn_frame_list_list");
-        goto out;
-    }
-
-    live_pfn_to_mfn_frame_list = 
-        xc_map_foreign_batch(xc_handle, dom, 
-                             PROT_READ,
-                             live_pfn_to_mfn_frame_list_list,
-                             (nr_pfns+(1024*1024)-1)/(1024*1024) );
-
-    if ( !live_pfn_to_mfn_frame_list)
-    {
-        ERR("Couldn't map pfn_to_mfn_frame_list");
-        goto out;
-    }
-
+    max_pfn = live_shinfo->arch.max_pfn;
+
+    live_p2m_frame_list_list = 
+        xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, 
+                             live_shinfo->arch.pfn_to_mfn_frame_list_list);
+
+    if (!live_p2m_frame_list_list) {
+        ERR("Couldn't map p2m_frame_list_list");
+        goto out;
+    }
+
+    live_p2m_frame_list = 
+        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+                             live_p2m_frame_list_list,
+                             P2M_FLL_ENTRIES); 
+    
+    if (!live_p2m_frame_list) {
+        ERR("Couldn't map p2m_frame_list");
+        goto out;
+    }
 
     /* Map all the frames of the pfn->mfn table. For migrate to succeed, 
        the guest must not change which frames are used for this purpose. 
        (its not clear why it would want to change them, and we'll be OK
        from a safety POV anyhow. */
 
-    live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, dom, 
-                                                 PROT_READ,
-                                                 live_pfn_to_mfn_frame_list,
-                                                 (nr_pfns+1023)/1024 );  
-    if ( !live_pfn_to_mfn_table )
-    {
-        ERR("Couldn't map pfn_to_mfn table");
+    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+                                    live_p2m_frame_list,
+                                    P2M_FL_ENTRIES); 
+
+    if (!live_p2m) {
+        ERR("Couldn't map p2m table");
         goto out;
     }
 
     /* Setup the mfn_to_pfn table mapping */
-    mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
-
-    live_mfn_to_pfn_table = 
-        xc_map_foreign_range(xc_handle, DOMID_XEN, 
-                             PAGE_SIZE*1024, PROT_READ, 
-                             mfn_to_pfn_table_start_mfn );
+    m2p_start_mfn = xc_get_m2p_start_mfn(xc_handle);
+    live_m2p      = xc_map_foreign_range(xc_handle, DOMID_XEN, M2P_SIZE, 
+                                         PROT_READ, m2p_start_mfn);
+    
+    /* Get a local copy of the live_P2M_frame_list */
+    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) { 
+        ERR("Couldn't allocate p2m_frame_list array");
+        goto out;
+    }
+    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE); 
 
     /* Canonicalise the pfn-to-mfn table frame-number list. */
-    memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
-
-    for ( i = 0; i < nr_pfns; i += 1024 )
-    {
-        if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) )
-        {
+    for (i = 0; i < max_pfn; i += ulpp) {
+        if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) { 
             ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
             goto out;
         }
     }
 
-
     /* Domain is still running at this point */
 
-    if ( live )
-    {
-        if ( xc_shadow_control( xc_handle, dom, 
-                                DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
-                                NULL, 0, NULL ) < 0 )
-        {
+    if (live) {
+
+        if (xc_shadow_control(xc_handle, dom, 
+                              DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
+                              NULL, 0, NULL ) < 0) { 
             ERR("Couldn't enable shadow mode");
             goto out;
         }
-
+        
         last_iter = 0;
-    } 
-    else
-    {
+        
+    } else {
+        
         /* This is a non-live suspend. Issue the call back to get the
            domain suspended */
-
+        
         last_iter = 1;
-
-        if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
-        {
+        
+        if (suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt)) {
             ERR("Domain appears not to have suspended");
             goto out;
         }
-
-    }
-    sent_last_iter = 1<<20; /* 4GB of pages */
-
-    /* calculate the power of 2 order of nr_pfns, e.g.
+        
+    }
+
+#if 0
+    sent_last_iter = 0xFFFFFFFF; /* Pretend we sent a /lot/ last time */
+#else
+    sent_last_iter = 1 << 20; 
+#endif
+
+
+    /* calculate the power of 2 order of max_pfn, e.g.
        15->4 16->4 17->5 */
-    for ( i = nr_pfns-1, order_nr = 0; i ; i >>= 1, order_nr++ )
+    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
         continue;
 
-    /* Setup to_send bitmap */
-    {
-        /* size these for a maximal 4GB domain, to make interaction
-           with balloon driver easier. It's only user space memory,
-           ater all... (3x 128KB) */
-
-        int sz = ( 1<<20 ) / 8;
- 
-        to_send = malloc( sz );
-        to_fix  = calloc( 1, sz );
-        to_skip = malloc( sz );
-
-        if ( !to_send || !to_fix || !to_skip )
-        {
-            ERR("Couldn't allocate to_send array");
-            goto out;
-        }
-
-        memset(to_send, 0xff, sz);
-
-        if ( mlock(to_send, sz) )
-        {
-            ERR("Unable to mlock to_send");
-            return 1;
-        }
-
-        /* (to fix is local only) */
-
-        if ( mlock(to_skip, sz) )
-        {
-            ERR("Unable to mlock to_skip");
-            return 1;
-        }
-
-    }
-
-    analysis_phase( xc_handle, dom, nr_pfns, to_skip, 0 );
+#undef BITMAP_SIZE
+#define BITMAP_SIZE ((1<<20)/8) 
+
+    /* Setup to_send / to_fix and to_skip bitmaps */
+    to_send = malloc(BITMAP_SIZE); 
+    to_fix  = calloc(1, BITMAP_SIZE); 
+    to_skip = malloc(BITMAP_SIZE); 
+    
+    if (!to_send || !to_fix || !to_skip) {
+        ERR("Couldn't allocate to_send array");
+        goto out;
+    }
+    
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    if (mlock(to_send, BITMAP_SIZE)) {
+        ERR("Unable to mlock to_send");
+        return 1;
+    }
+
+    /* (to fix is local only) */
+    if (mlock(to_skip, BITMAP_SIZE)) {
+        ERR("Unable to mlock to_skip");
+        return 1;
+    }
+        
+    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);
 
     /* We want zeroed memory so use calloc rather than malloc. */
-    pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
-    pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
-
-    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
-    {
+    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
+    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
+
+    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
         errno = ENOMEM;
         goto out;
     }
 
-    if ( mlock(pfn_type, BATCH_SIZE * sizeof(unsigned long)) )
-    {
+    if (mlock(pfn_type, MAX_BATCH_SIZE * sizeof(unsigned long))) {
         ERR("Unable to mlock");
         goto out;
     }
@@ -663,46 +751,40 @@
     /*
      * Quick belt and braces sanity check.
      */
-#if DEBUG
     {
         int err=0;
-        for ( i = 0; i < nr_pfns; i++ )
-        {
-            mfn = live_pfn_to_mfn_table[i];
-     
-            if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
-            {
-                fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
-                        i,mfn,live_mfn_to_pfn_table[mfn]);
+        unsigned long mfn; 
+        for (i = 0; i < max_pfn; i++) {
+
+            mfn = live_p2m[i];
+            if((live_m2p[mfn] != i) && (mfn != 0xffffffffUL)) { 
+                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i, 
+                        mfn, live_m2p[mfn]);
                 err++;
             }
         }
-        fprintf(stderr, "Had %d unexplained entries in p2m table\n",err);
-    }
-#endif
+        DPRINTF("Had %d unexplained entries in p2m table\n", err);
+    }
 
 
     /* Start writing out the saved-domain record. */
 
-    if ( write(io_fd, &nr_pfns, sizeof(unsigned long)) !=
-         sizeof(unsigned long) )
-    {
-        ERR("write: nr_pfns");
-        goto out;
-    }
-
-    if ( write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE )
-    {
-        ERR("write: pfn_to_mfn_frame_list");
-        goto out;
-    }
-
-    print_stats( xc_handle, dom, 0, &stats, 0 );
+    if(!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) { 
+        ERR("write: max_pfn");
+        goto out;
+    }
+
+    if(!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) { 
+        ERR("write: p2m_frame_list");
+        goto out;
+    }
+    
+    print_stats(xc_handle, dom, 0, &stats, 0);
 
     /* Now write out each data page, canonicalising page tables as we go... */
-    
-    for ( ; ; )
-    {
+
+    while(1) {
+
         unsigned int prev_pc, sent_this_iter, N, batch;
 
         iter++;
@@ -713,24 +795,20 @@
 
         DPRINTF("Saving memory pages: iter %d   0%%", iter);
 
-        while ( N < nr_pfns )
-        {
-            unsigned int this_pc = (N * 100) / nr_pfns;
-
-            if ( (this_pc - prev_pc) >= 5 )
-            {
+        while( N < max_pfn ){
+
+            unsigned int this_pc = (N * 100) / max_pfn;
+
+            if ((this_pc - prev_pc) >= 5) {
                 DPRINTF("\b\b\b\b%3d%%", this_pc);
                 prev_pc = this_pc;
             }
-
+            
             /* slightly wasteful to peek the whole array evey time, 
                but this is fast enough for the moment. */
-
-            if ( !last_iter && 
-                 xc_shadow_control(xc_handle, dom, 
-                                   DOM0_SHADOW_CONTROL_OP_PEEK,
-                                   to_skip, nr_pfns, NULL) != nr_pfns )
-            {
+            if (!last_iter && xc_shadow_control(
+                    xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
+                    to_skip, max_pfn, NULL) != max_pfn) {
                 ERR("Error peeking shadow bitmap");
                 goto out;
             }
@@ -738,219 +816,168 @@
 
             /* load pfn_type[] with the mfn of all the pages we're doing in
                this batch. */
-
-            for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
-            {
-                int n = permute(N, nr_pfns, order_nr );
-
-                if ( 0 && debug ) {
-                    fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d  "
-                            " [mfn]= %08lx\n",
-                            iter, (unsigned long)n, live_pfn_to_mfn_table[n],
-                            test_bit(n,to_send),
-                            live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
-                                                 0xFFFFF]);
+            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {
+
+                int n = permute(N, max_pfn, order_nr);
+
+                if (debug) {
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
+                            iter, (unsigned long)n, live_p2m[n],
+                            test_bit(n, to_send), 
+                            live_m2p[live_p2m[n]&0xFFFFF]);
                 }
-
-                if ( !last_iter && 
-                     test_bit(n, to_send) && 
-                     test_bit(n, to_skip) ) {
+                
+                if (!last_iter && test_bit(n, to_send)&& test_bit(n, to_skip)) 
                     skip_this_iter++; /* stats keeping */
-                }
-
-                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
-                       (test_bit(n, to_send) && last_iter) ||
-                       (test_bit(n, to_fix)  && last_iter)) ) {
+
+                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
+                      (test_bit(n, to_send) && last_iter) ||
+                      (test_bit(n, to_fix)  && last_iter)))
                     continue;
-                }
-
-                /* we get here if:
-                   1. page is marked to_send & hasn't already been re-dirtied
-                   2. (ignore to_skip in last iteration)
-                   3. add in pages that still need fixup (net bufs)
+
+                /* 
+                ** we get here if:
+                **  1. page is marked to_send & hasn't already been re-dirtied
+                **  2. (ignore to_skip in last iteration)
+                **  3. add in pages that still need fixup (net bufs)
                 */
   
                 pfn_batch[batch] = n;
-                pfn_type[batch] = live_pfn_to_mfn_table[n];
-
-                if( ! is_mapped(pfn_type[batch]) )
-                {
+                pfn_type[batch]  = live_p2m[n];
+
+                if(!is_mapped(pfn_type[batch])) {
+
                     /* not currently in pusedo-physical map -- set bit
                        in to_fix that we must send this page in last_iter
                        unless its sent sooner anyhow */
 
-                    set_bit( n, to_fix );
-                    if( iter>1 )
+                    set_bit(n, to_fix);
+                    if(iter > 1)
                         DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
-                                iter,n,pfn_type[batch]);
+                                iter, n, pfn_type[batch]);
                     continue;
                 }
 
-                if ( last_iter && 
-                     test_bit(n, to_fix) && 
-                     !test_bit(n, to_send) )
-                {
+                if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
                     needed_to_fix++;
                     DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
                             iter,n,pfn_type[batch]);
                 }
 
                 clear_bit(n, to_fix); 
-
+                
                 batch++;
             }
      
-            if ( batch == 0 )
+            if (batch == 0)
                 goto skip; /* vanishingly unlikely... */
       
-            if ( (region_base = xc_map_foreign_batch(xc_handle, dom, 
-                                                     PROT_READ,
-                                                     pfn_type,
-                                                     batch)) == 0 ){
+            if ((region_base = xc_map_foreign_batch(
+                     xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) { 
                 ERR("map batch failed");
                 goto out;
             }
      
-            if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){
+            if (xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type)) {
                 ERR("get_pfn_type_batch failed");
                 goto out;
             }
      
-            for ( j = 0; j < batch; j++ )
-            {
-                if ( (pfn_type[j] & LTAB_MASK) == XTAB )
-                {
-                    DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
+            for (j = 0; j < batch; j++) {
+
+                if ((pfn_type[j] & LTAB_MASK) == XTAB) {
+                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
                     continue;
                 }
   
-                if ( 0 && debug )
+                if (debug) 
                     fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
                             " sum= %08lx\n",
                             iter, 
                             (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
                             pfn_type[j],
-                            live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
+                            live_m2p[pfn_type[j]&(~LTAB_MASK)],
                             csum_page(region_base + (PAGE_SIZE*j)));
-
+                
                 /* canonicalise mfn->pfn */
                 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
             }
 
-            if ( write(io_fd, &batch, sizeof(int)) != sizeof(int) )
-            {
+            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) { 
                 ERR("Error when writing to state file (2)");
                 goto out;
             }
 
-            if ( write(io_fd, pfn_type, sizeof(unsigned long)*j) !=
-                 (sizeof(unsigned long) * j) )
-            {
+            if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) { 
                 ERR("Error when writing to state file (3)");
                 goto out;
             }
-     
+            
             /* entering this loop, pfn_type is now in pfns (Not mfns) */
-            for ( j = 0; j < batch; j++ )
-            {
+            for (j = 0; j < batch; j++) {
+                
+                unsigned long pfn      = pfn_type[j] & ~LTAB_MASK; 
+                unsigned long pagetype = pfn_type[j] & LTAB_MASK; 
+                void *spage            = (void *) region_base + (PAGE_SIZE*j); 
+
+
                 /* write out pages in batch */
-                if ( (pfn_type[j] & LTAB_MASK) == XTAB )
-                {
-                    DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
+                if (pagetype == XTAB) {
+                    DPRINTF("SKIP BOGUS page %i mfn %08lx\n", j, pfn_type[j]);
                     continue;
                 }
-  
-                if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) || 
-                     ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
-                    memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
-      
-                    for ( k = 0; 
-                          k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ? 
-                               (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
-                               1024); 
-                          k++ )
-                    {
-                        unsigned long pfn;
-
-                        if ( !(page[k] & _PAGE_PRESENT) )
-                            continue;
-                        
-                        mfn = page[k] >> PAGE_SHIFT;      
-                        pfn = live_mfn_to_pfn_table[mfn];
-
-                        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
-                        {
-                            /* I don't think this should ever happen */
-                            fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, "
-                                    "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
-                                    j, pfn_type[j], k,
-                                    page[k], mfn, live_mfn_to_pfn_table[mfn],
-                                    (live_mfn_to_pfn_table[mfn]<nr_pfns)? 
-                                    live_pfn_to_mfn_table[
-                                        live_mfn_to_pfn_table[mfn]] : 
-                                    0xdeadbeef);
-
-                            pfn = 0; /* be suspicious */
-                        }
-
-                        page[k] &= PAGE_SIZE - 1;
-                        page[k] |= pfn << PAGE_SHIFT;
-   
-#if 0
-                        fprintf(stderr,
-                                "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
-                                "xpfn=%d\n",
-                                pfn_type[j]>>28,
-                                j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
-#endif     
-   
-                    } /* end of page table rewrite for loop */
-      
+
+                pagetype &= LTABTYPE_MASK; 
+                
+                if (pagetype >= L1TAB && pagetype <= L4TAB) {
+                    
+                    /* We have a pagetable page: need to rewrite it. */
+                    canonicalize_pagetable(pagetype, pfn, spage, page); 
+                    
                     if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
                         ERR("Error when writing to state file (4)");
                         goto out;
                     }
-      
-                }  /* end of it's a PT page */ else {  /* normal page */
-
-                    if ( ratewrite(io_fd, region_base + (PAGE_SIZE*j), 
-                                   PAGE_SIZE) != PAGE_SIZE )
-                    {
+                    
+                }  else {  
+
+                    /* We have a normal page: just write it directly. */
+                    if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
                         ERR("Error when writing to state file (5)");
                         goto out;
                     }
                 }
             } /* end of the write out for this batch */
-     
+            
             sent_this_iter += batch;
-
+            
         } /* end of this while loop for this iteration */
-
+        
         munmap(region_base, batch*PAGE_SIZE);
-
-    skip: 
-
+        
+      skip: 
+        
         total_sent += sent_this_iter;
 
         DPRINTF("\r %d: sent %d, skipped %d, ", 
                 iter, sent_this_iter, skip_this_iter );
 
-        if ( last_iter ) {
+        if (last_iter) {
             print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
 
-            DPRINTF("Total pages sent= %d (%.2fx)\n", 
-                    total_sent, ((float)total_sent)/nr_pfns );
-            DPRINTF("(of which %d were fixups)\n", needed_to_fix  );
+            DPRINTF("Total pages sent= %ld (%.2fx)\n", 
+                    total_sent, ((float)total_sent)/max_pfn );
+            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
         }       
 
         if (last_iter && debug){
             int minusone = -1;
-            memset( to_send, 0xff, (nr_pfns+8)/8 );
+            memset( to_send, 0xff, (max_pfn+8)/8 );
             debug = 0;
             fprintf(stderr, "Entering debug resend-all mode\n");
     
             /* send "-1" to put receiver into debug mode */
-            if (write(io_fd, &minusone, sizeof(int)) != sizeof(int)) {
+            if(!write_exact(io_fd, &minusone, sizeof(int))) { 
                 ERR("Error when writing to state file (6)");
                 goto out;
             }
@@ -958,42 +985,39 @@
             continue;
         }
 
-        if ( last_iter ) break; 
-
-        if ( live )
-        {
-            if ( 
-                ( ( sent_this_iter > sent_last_iter ) &&
-                  (mbit_rate == MAX_MBIT_RATE ) ) ||
-                (iter >= max_iters) || 
-                (sent_this_iter+skip_this_iter < 50) || 
-                (total_sent > nr_pfns*max_factor) )
-            {
+        if (last_iter) break; 
+
+        if (live) {
+
+
+            if( 
+                ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
+                (iter >= max_iters) ||
+                (sent_this_iter+skip_this_iter < 50) ||
+                (total_sent > max_pfn*max_factor) ) { 
+
                 DPRINTF("Start last iteration\n");
                 last_iter = 1;
-
-                if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
-                {
+                
+                if (suspend_and_state(xc_handle, io_fd, dom, &info, &ctxt)) {
                     ERR("Domain appears not to have suspended");
                     goto out;
                 }
-
+                
                 DPRINTF("SUSPEND shinfo %08lx eip %08u edx %08u\n",
                         info.shared_info_frame,
                         ctxt.user_regs.eip, ctxt.user_regs.edx);
             } 
-
-            if ( xc_shadow_control( xc_handle, dom, 
-                                    DOM0_SHADOW_CONTROL_OP_CLEAN,
-                                    to_send, nr_pfns, &stats ) != nr_pfns ) 
-            {
+            
+            if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
+                                  to_send, max_pfn, &stats ) != max_pfn) {  
                 ERR("Error flushing shadow PT");
                 goto out;
             }
 
             sent_last_iter = sent_this_iter;
 
-            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
      
         }
 
@@ -1005,9 +1029,10 @@
     /* Success! */
     rc = 0;
     
+    /* ^^^^^^ XXX SMH: hmm.. not sure that's really success! */
+    
     /* Zero terminate */
-    if ( write(io_fd, &rc, sizeof(int)) != sizeof(int) )
-    {
+    if (!write_exact(io_fd, &rc, sizeof(int))) { 
         ERR("Error when writing to state file (6)");
         goto out;
     }
@@ -1015,84 +1040,76 @@
     /* Send through a list of all the PFNs that were not in map at the close */
     {
         unsigned int i,j;
-        unsigned int pfntab[1024];
-
-        for ( i = 0, j = 0; i < nr_pfns; i++ )
-            if ( !is_mapped(live_pfn_to_mfn_table[i]) )
+        unsigned long pfntab[1024]; 
+
+        for ( i = 0, j = 0; i < max_pfn; i++ ) {
+            if ( ! is_mapped(live_p2m[i]) )
                 j++;
-
-        if ( write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int) )
-        {
+        }
+
+        if(!write_exact(io_fd, &j, sizeof(unsigned int))) { 
             ERR("Error when writing to state file (6a)");
             goto out;
-        } 
-
-        for ( i = 0, j = 0; i < nr_pfns; )
-        {
-            if ( !is_mapped(live_pfn_to_mfn_table[i]) )
-            {
+        }      
+        
+        for ( i = 0, j = 0; i < max_pfn; ) {
+
+            if (!is_mapped(live_p2m[i]))
                 pfntab[j++] = i;
-            }
+
             i++;
-            if ( j == 1024 || i == nr_pfns )
-            {
-                if ( write(io_fd, &pfntab, sizeof(unsigned long)*j) !=
-                     (sizeof(unsigned long) * j) )
-                {
+            if (j == 1024 || i == max_pfn) {
+                if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) { 
                     ERR("Error when writing to state file (6b)");
                     goto out;
                 } 
                 j = 0;
             }
         }
-    }
-
+
+    }
+    
     /* Canonicalise the suspend-record frame number. */
-    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
-    {
+    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ){
         ERR("Suspend record is not in range of pseudophys map");
         goto out;
     }
 
     /* Canonicalise each GDT frame number. */
-    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
-    {
-        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) 
-        {
+    for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
+        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
             ERR("GDT frame is not in range of pseudophys map");
             goto out;
         }
     }
 
     /* Canonicalise the page table base pointer. */
-    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) )
-    {
+    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
         ERR("PT base is not in range of pseudophys map");
         goto out;
     }
-    ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
+    ctxt.ctrlreg[3] = live_m2p[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
         PAGE_SHIFT;
 
-    if ( write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
-         write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE)
-    {
+    if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
+        !write_exact(io_fd, live_shinfo, PAGE_SIZE)) { 
         ERR("Error when writing to state file (1)");
         goto out;
     }
-
+    
  out:
 
-    if ( live_shinfo )
+    if (live_shinfo)
         munmap(live_shinfo, PAGE_SIZE);
-
-    if ( live_pfn_to_mfn_frame_list ) 
-        munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
-
-    if ( live_pfn_to_mfn_table ) 
-        munmap(live_pfn_to_mfn_table, nr_pfns*4);
-
-    if ( live_mfn_to_pfn_table ) 
-        munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
+    
+    if (live_p2m_frame_list) 
+        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE); 
+
+    if(live_p2m) 
+        munmap(live_p2m, P2M_SIZE); 
+
+    if(live_m2p) 
+        munmap(live_m2p, M2P_SIZE); 
 
     free(pfn_type);
     free(pfn_batch);
@@ -1101,6 +1118,7 @@
     free(to_skip);
 
     DPRINTF("Save exit rc=%d\n",rc);
+
     return !!rc;
 }
 
diff -r abbe3df33774 -r b3c2bc39d815 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Tue Nov  8 17:39:58 2005
+++ b/tools/libxc/xenctrl.h     Tue Nov  8 17:42:07 2005
@@ -17,6 +17,7 @@
 #include <xen/event_channel.h>
 #include <xen/sched.h>
 #include <xen/sched_ctl.h>
+#include <xen/memory.h>
 #include <xen/acm.h>
 
 #ifdef __ia64__
diff -r abbe3df33774 -r b3c2bc39d815 tools/libxc/xg_private.h
--- a/tools/libxc/xg_private.h  Tue Nov  8 17:39:58 2005
+++ b/tools/libxc/xg_private.h  Tue Nov  8 17:42:07 2005
@@ -11,8 +11,10 @@
 #include <sys/stat.h>
 
 #include "xenctrl.h"
+#include "xenguest.h" 
 
 #include <xen/linux/privcmd.h>
+#include <xen/memory.h>
 
 char *xc_read_kernel_image(const char *filename, unsigned long *size);
 unsigned long csum_page (void * page);
diff -r abbe3df33774 -r b3c2bc39d815 tools/libxc/xg_save_restore.h
--- /dev/null   Tue Nov  8 17:39:58 2005
+++ b/tools/libxc/xg_save_restore.h     Tue Nov  8 17:42:07 2005
@@ -0,0 +1,123 @@
+/*
+** xg_save_restore.h
+** 
+** Definitions and utilities for save / restore. 
+*/
+
+#define DEBUG    1
+#define PROGRESS 0
+
+#define ERR(_f, _a...) do {                     \
+    fprintf(stderr, _f "\n" , ## _a);           \
+    fflush(stderr); }                           \
+while (0)
+
+#if DEBUG
+#define DPRINTF(_f, _a...) fprintf(stderr, _f , ## _a)
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+
+#if PROGRESS
+#define PPRINTF(_f, _a...) fprintf(stderr, _f , ## _a)
+#else
+#define PPRINTF(_f, _a...)
+#endif
+
+
+/*
+** We process save/restore/migrate in batches of pages; the below 
+** determines how many pages we (at maximum) deal with in each batch. 
+*/
+#define MAX_BATCH_SIZE 1024   /* up to 1024 pages (4MB) at a time */
+
+/* When pinning page tables at the end of restore, we also use batching. */
+#define MAX_PIN_BATCH  1024
+
+
+
+/*
+** Determine various platform information required for save/restore, in 
+** particular: 
+**
+**    - the maximum MFN on this machine, used to compute the size of 
+**      the M2P table; 
+** 
+**    - the starting virtual address of the hypervisor; we use this 
+**      to determine which parts of guest address space(s) do and don't 
+**      require canonicalization during save/restore; and 
+** 
+**    - the number of page-table levels for save/restore. This should 
+**      be a property of the domain, but for the moment we just read it 
+**      from the hypervisor.
+**
+** The 'dom' argument is accepted but not consulted by the body below.
+** Returns 1 on success, 0 on failure (outputs may be partly written).
+*/
+static int get_platform_info(int xc_handle, uint32_t dom, 
+                             /* OUT */ uint32_t *max_mfn,  
+                             /* OUT */ uint32_t *hvirt_start, 
+                             /* OUT */ uint32_t *pt_levels)
+    
+{ 
+    xen_capabilities_info_t xen_caps = "";
+    xen_parameters_info_t xen_parms;
+    xc_physinfo_t physinfo;
+    
+    if (xc_physinfo(xc_handle, &physinfo) != 0) 
+        return 0;
+    
+    if (xc_version(xc_handle, XENVER_parameters, &xen_parms) != 0)
+        return 0;
+    
+    if (xc_version(xc_handle, XENVER_capabilities, &xen_caps) != 0)
+        return 0;
+
+    *max_mfn =     physinfo.total_pages;   /* NOTE(review): page count used as max MFN -- confirm this holds if RAM has holes */
+    *hvirt_start = xen_parms.virt_start;
+
+    if (strstr(xen_caps, "xen-3.0-x86_64"))
+        *pt_levels = 4;
+    else if (strstr(xen_caps, "xen-3.0-x86_32p"))   /* must be tested before "x86_32", which is a substring of it */
+        *pt_levels = 3; 
+    else if (strstr(xen_caps, "xen-3.0-x86_32"))
+        *pt_levels = 2; 
+    else 
+        return 0;   /* no recognised capability string; *max_mfn and *hvirt_start were already written */
+    
+    return 1;
+} 
+
+
+/* 
+** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables. 
+** The M2P simply holds the corresponding PFN, while the top bit of a P2M
+** entry tells us whether or not the PFN is currently mapped.
+*/
+
+#define PFN_TO_KB(_pfn) ((_pfn) * PAGE_SIZE / 1024)
+#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
+
+/* Size in bytes of the M2P and P2M (both rounded up to nearest PAGE_SIZE) */
+#define M2P_SIZE ROUNDUP((max_mfn * sizeof(unsigned long)), PAGE_SHIFT) 
+#define P2M_SIZE ROUNDUP((max_pfn * sizeof(unsigned long)), PAGE_SHIFT) 
+
+
+/* Number of unsigned longs in a page */
+#define ulpp            (PAGE_SIZE/sizeof(unsigned long))
+
+/* Number of entries in the pfn_to_mfn_frame_list */
+#define P2M_FL_ENTRIES  (((max_pfn)+ulpp-1)/ulpp)
+
+/* Size in bytes of the pfn_to_mfn_frame_list     */
+#define P2M_FL_SIZE     ((P2M_FL_ENTRIES)*sizeof(unsigned long))
+
+/* Number of entries in the pfn_to_mfn_frame_list_list */
+#define P2M_FLL_ENTRIES (((max_pfn)+(ulpp*ulpp)-1)/(ulpp*ulpp))
+
+/* Returns TRUE if the PFN is currently mapped */
+#define is_mapped(pfn_type) (!((pfn_type) & 0x80000000UL))
+
+
+
+

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] Enable save/restore for PAE domains., Xen patchbot -unstable <=