# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1257794367 0
# Node ID f6091947ffed5707b60120fe44d3d038a3307591
# Parent 01c9cb566f61f143870600b30857562963aaee27
Remus: Make checkpoint buffering HVM-aware
Signed-off-by: Brendan Cully <brendan@xxxxxxxxx>
---
tools/libxc/xc_domain_restore.c | 387 +++++++++++++++++++++++---------
tools/python/xen/xend/XendCheckpoint.py | 20 -
2 files changed, 289 insertions(+), 118 deletions(-)
diff -r 01c9cb566f61 -r f6091947ffed tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c Mon Nov 09 19:17:22 2009 +0000
+++ b/tools/libxc/xc_domain_restore.c Mon Nov 09 19:19:27 2009 +0000
@@ -670,15 +670,204 @@ static xen_pfn_t *load_p2m_frame_list(
}
typedef struct {
- unsigned int pfncount;
- unsigned long* pfntab;
- unsigned int vcpucount;
- unsigned char* vcpubuf;
- unsigned char shared_info_page[PAGE_SIZE];
+ int ishvm;
+ union {
+ struct tailbuf_pv {
+ unsigned int pfncount;
+ unsigned long* pfntab;
+ unsigned int vcpucount;
+ unsigned char* vcpubuf;
+ unsigned char shared_info_page[PAGE_SIZE];
+ } pv;
+ struct tailbuf_hvm {
+ uint64_t magicpfns[3];
+ uint32_t hvmbufsize, reclen;
+ uint8_t* hvmbuf;
+ struct {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t len;
+ } qemuhdr;
+ uint32_t qemubufsize;
+ uint8_t* qemubuf;
+ } hvm;
+ } u;
} tailbuf_t;
-static int buffer_tail(tailbuf_t* buf, int fd, unsigned int max_vcpu_id,
- uint64_t vcpumap, int ext_vcpucontext)
+/* read stream until EOF, growing buffer as necessary */
+static int compat_buffer_qemu(int fd, struct tailbuf_hvm *buf)
+{
+ uint8_t *qbuf, *tmp;
+ int blen = 0, dlen = 0;
+ int rc;
+
+ /* currently save records tend to be about 7K */
+ blen = 8192;
+ if ( !(qbuf = malloc(blen)) ) {
+ ERROR("Error allocating QEMU buffer");
+ return -1;
+ }
+
+ while( (rc = read(fd, qbuf+dlen, blen-dlen)) > 0 ) {
+ DPRINTF("Read %d bytes of QEMU data\n", rc);
+ dlen += rc;
+
+ if (dlen == blen) {
+ DPRINTF("%d-byte QEMU buffer full, reallocating...\n", dlen);
+ blen += 4096;
+ tmp = realloc(qbuf, blen);
+ if ( !tmp ) {
+ ERROR("Error growing QEMU buffer to %d bytes", blen);
+ free(qbuf);
+ return -1;
+ }
+ qbuf = tmp;
+ }
+ }
+
+ if ( rc < 0 ) {
+ ERROR("Error reading QEMU data");
+ free(qbuf);
+ return -1;
+ }
+
+ if ( memcmp(qbuf, "QEVM", 4) ) {
+ ERROR("Invalid QEMU magic: 0x%08x", *(unsigned long*)qbuf);
+ free(qbuf);
+ return -1;
+ }
+
+ buf->qemubuf = qbuf;
+ buf->qemubufsize = dlen;
+
+ return 0;
+}
+
+static int buffer_qemu(int fd, struct tailbuf_hvm *buf)
+{
+ uint32_t qlen;
+ uint8_t *tmp;
+
+ if ( read_exact(fd, &qlen, sizeof(qlen)) ) {
+ ERROR("Error reading QEMU header length");
+ return -1;
+ }
+
+ if ( qlen > buf->qemubufsize ) {
+ if ( buf->qemubuf) {
+ tmp = realloc(buf->qemubuf, qlen);
+ if ( tmp )
+ buf->qemubuf = tmp;
+ else {
+ ERROR("Error reallocating QEMU state buffer");
+ return -1;
+ }
+ } else {
+ buf->qemubuf = malloc(qlen);
+ if ( !buf->qemubuf ) {
+ ERROR("Error allocating QEMU state buffer");
+ return -1;
+ }
+ }
+ }
+ buf->qemubufsize = qlen;
+
+ if ( read_exact(fd, buf->qemubuf, buf->qemubufsize) ) {
+ ERROR("Error reading QEMU state");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int dump_qemu(uint32_t dom, struct tailbuf_hvm *buf)
+{
+ int saved_errno;
+ char path[256];
+ FILE *fp;
+
+ sprintf(path, "/var/lib/xen/qemu-save.%u", dom);
+ fp = fopen(path, "wb");
+ if ( !fp )
+ return -1;
+
+ DPRINTF("Writing %d bytes of QEMU data\n", buf->qemubufsize);
+ if ( fwrite(buf->qemubuf, 1, buf->qemubufsize, fp) != buf->qemubufsize) {
+ saved_errno = errno;
+ fclose(fp);
+ errno = saved_errno;
+ return -1;
+ }
+
+ fclose(fp);
+
+ return 0;
+}
+
+static int buffer_tail_hvm(struct tailbuf_hvm *buf, int fd,
+ unsigned int max_vcpu_id, uint64_t vcpumap,
+ int ext_vcpucontext)
+{
+ uint8_t *tmp;
+ unsigned char qemusig[21];
+
+ if ( read_exact(fd, buf->magicpfns, sizeof(buf->magicpfns)) ) {
+ ERROR("Error reading magic PFNs");
+ return -1;
+ }
+
+ if ( read_exact(fd, &buf->reclen, sizeof(buf->reclen)) ) {
+ ERROR("Error reading HVM params size");
+ return -1;
+ }
+
+ if ( buf->reclen > buf->hvmbufsize ) {
+ if ( buf->hvmbuf) {
+ tmp = realloc(buf->hvmbuf, buf->reclen);
+ if ( tmp ) {
+ buf->hvmbuf = tmp;
+ buf->hvmbufsize = buf->reclen;
+ } else {
+ ERROR("Error reallocating HVM param buffer");
+ return -1;
+ }
+ } else {
+ buf->hvmbuf = malloc(buf->reclen);
+ if ( !buf->hvmbuf ) {
+ ERROR("Error allocating HVM param buffer");
+ return -1;
+ }
+ buf->hvmbufsize = buf->reclen;
+ }
+ }
+
+ if ( read_exact(fd, buf->hvmbuf, buf->reclen) ) {
+ ERROR("Error reading HVM params");
+ return -1;
+ }
+
+ if ( read_exact(fd, qemusig, sizeof(qemusig)) ) {
+ ERROR("Error reading QEMU signature");
+ return -1;
+ }
+
+ /* The normal live-migration QEMU record has no length information.
+ * Short of reimplementing the QEMU parser, we're forced to just read
+ * until EOF. Remus gets around this by sending a different signature
+ * which includes a length prefix */
+ if ( !memcmp(qemusig, "QemuDeviceModelRecord", sizeof(qemusig)) )
+ return compat_buffer_qemu(fd, buf);
+ else if ( !memcmp(qemusig, "RemusDeviceModelState", sizeof(qemusig)) )
+ return buffer_qemu(fd, buf);
+
+ qemusig[20] = '\0';
+ ERROR("Invalid QEMU signature: %s", qemusig);
+ return -1;
+}
+
+static int buffer_tail_pv(struct tailbuf_pv *buf, int fd,
+ unsigned int max_vcpu_id, uint64_t vcpumap,
+ int ext_vcpucontext)
{
unsigned int i;
size_t pfnlen, vcpulen;
@@ -753,16 +942,47 @@ static int buffer_tail(tailbuf_t* buf, i
return -1;
}
-static void tailbuf_free(tailbuf_t* buf)
-{
- if (buf->vcpubuf) {
+static int buffer_tail(tailbuf_t *buf, int fd, unsigned int max_vcpu_id,
+ uint64_t vcpumap, int ext_vcpucontext)
+{
+ if ( buf->ishvm )
+ return buffer_tail_hvm(&buf->u.hvm, fd, max_vcpu_id, vcpumap,
+ ext_vcpucontext);
+ else
+ return buffer_tail_pv(&buf->u.pv, fd, max_vcpu_id, vcpumap,
+ ext_vcpucontext);
+}
+
+static void tailbuf_free_hvm(struct tailbuf_hvm *buf)
+{
+ if ( buf->hvmbuf ) {
+ free(buf->hvmbuf);
+ buf->hvmbuf = NULL;
+ }
+ if ( buf->qemubuf ) {
+ free(buf->qemubuf);
+ buf->qemubuf = NULL;
+ }
+}
+
+static void tailbuf_free_pv(struct tailbuf_pv *buf)
+{
+ if ( buf->vcpubuf ) {
free(buf->vcpubuf);
buf->vcpubuf = NULL;
}
- if (buf->pfntab) {
+ if ( buf->pfntab ) {
free(buf->pfntab);
buf->pfntab = NULL;
}
+}
+
+static void tailbuf_free(tailbuf_t *buf)
+{
+ if ( buf->ishvm )
+ tailbuf_free_hvm(&buf->u.hvm);
+ else
+ tailbuf_free_pv(&buf->u.pv);
}
typedef struct {
@@ -1118,18 +1338,13 @@ int xc_domain_restore(int xc_handle, int
unsigned int max_vcpu_id = 0;
int new_ctxt_format = 0;
- /* Magic frames in HVM guests: ioreqs and xenstore comms. */
- uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
-
- /* Buffer for holding HVM context */
- uint8_t *hvm_buf = NULL;
-
pagebuf_t pagebuf;
tailbuf_t tailbuf, tmptail;
void* vcpup;
pagebuf_init(&pagebuf);
memset(&tailbuf, 0, sizeof(tailbuf));
+ tailbuf.ishvm = hvm;
/* For info only */
nr_pfns = 0;
@@ -1313,78 +1528,6 @@ int xc_domain_restore(int xc_handle, int
// DPRINTF("Received all pages (%d races)\n", nraces);
- if ( hvm )
- {
- uint32_t rec_len;
-
- /* Set HVM-specific parameters */
- if ( read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
- {
- ERROR("error reading magic page addresses");
- goto out;
- }
-
- /* These comms pages need to be zeroed at the start of day */
- if ( xc_clear_domain_page(xc_handle, dom, magic_pfns[0]) ||
- xc_clear_domain_page(xc_handle, dom, magic_pfns[1]) ||
- xc_clear_domain_page(xc_handle, dom, magic_pfns[2]) )
- {
- ERROR("error zeroing magic pages");
- goto out;
- }
-
- if ( (frc = xc_set_hvm_param(xc_handle, dom,
- HVM_PARAM_IOREQ_PFN, magic_pfns[0]))
- || (frc = xc_set_hvm_param(xc_handle, dom,
- HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]))
- || (frc = xc_set_hvm_param(xc_handle, dom,
- HVM_PARAM_STORE_PFN, magic_pfns[2]))
- || (frc = xc_set_hvm_param(xc_handle, dom,
- HVM_PARAM_PAE_ENABLED, pae))
- || (frc = xc_set_hvm_param(xc_handle, dom,
- HVM_PARAM_STORE_EVTCHN,
- store_evtchn)) )
- {
- ERROR("error setting HVM params: %i", frc);
- goto out;
- }
- *store_mfn = magic_pfns[2];
-
- /* Read HVM context */
- if ( read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
- {
- ERROR("error read hvm context size!\n");
- goto out;
- }
-
- hvm_buf = malloc(rec_len);
- if ( hvm_buf == NULL )
- {
- ERROR("memory alloc for hvm context buffer failed");
- errno = ENOMEM;
- goto out;
- }
-
- if ( read_exact(io_fd, hvm_buf, rec_len) )
- {
- ERROR("error loading the HVM context");
- goto out;
- }
-
- frc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf, rec_len);
- if ( frc )
- {
- ERROR("error setting the HVM context");
- goto out;
- }
-
- /* HVM success! */
- rc = 0;
- goto out;
- }
-
- /* Non-HVM guests only from here on */
-
if ( !completed ) {
int flags = 0;
@@ -1407,6 +1550,7 @@ int xc_domain_restore(int xc_handle, int
goto finish;
}
memset(&tmptail, 0, sizeof(tmptail));
+ tmptail.ishvm = hvm;
if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
ext_vcpucontext) < 0 ) {
ERROR ("error buffering image tail, finishing");
@@ -1418,6 +1562,8 @@ int xc_domain_restore(int xc_handle, int
goto loadpages;
finish:
+ if ( hvm )
+ goto finish_hvm;
if ( (pt_levels == 3) && !pae_extended_cr3 )
{
@@ -1589,15 +1735,15 @@ int xc_domain_restore(int xc_handle, int
{
int nr_frees = 0;
- for ( i = 0; i < tailbuf.pfncount; i++ )
- {
- unsigned long pfn = tailbuf.pfntab[i];
+ for ( i = 0; i < tailbuf.u.pv.pfncount; i++ )
+ {
+ unsigned long pfn = tailbuf.u.pv.pfntab[i];
if ( p2m[pfn] != INVALID_P2M_ENTRY )
{
/* pfn is not in physmap now, but was at some point during
the save/migration process - need to free it */
- tailbuf.pfntab[nr_frees++] = p2m[pfn];
+ tailbuf.u.pv.pfntab[nr_frees++] = p2m[pfn];
p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
}
}
@@ -1609,7 +1755,7 @@ int xc_domain_restore(int xc_handle, int
.extent_order = 0,
.domid = dom
};
- set_xen_guest_handle(reservation.extent_start, tailbuf.pfntab);
+ set_xen_guest_handle(reservation.extent_start, tailbuf.u.pv.pfntab);
if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
&reservation)) != nr_frees )
@@ -1618,7 +1764,7 @@ int xc_domain_restore(int xc_handle, int
goto out;
}
else
- DPRINTF("Decreased reservation by %d pages\n", tailbuf.pfncount);
+ DPRINTF("Decreased reservation by %d pages\n", tailbuf.u.pv.pfncount);
}
}
@@ -1628,7 +1774,7 @@ int xc_domain_restore(int xc_handle, int
return 1;
}
- vcpup = tailbuf.vcpubuf;
+ vcpup = tailbuf.u.pv.vcpubuf;
for ( i = 0; i <= max_vcpu_id; i++ )
{
if ( !(vcpumap & (1ULL << i)) )
@@ -1755,7 +1901,7 @@ int xc_domain_restore(int xc_handle, int
}
}
- memcpy(shared_info_page, tailbuf.shared_info_page, PAGE_SIZE);
+ memcpy(shared_info_page, tailbuf.u.pv.shared_info_page, PAGE_SIZE);
DPRINTF("Completed checkpoint load\n");
@@ -1812,6 +1958,51 @@ int xc_domain_restore(int xc_handle, int
DPRINTF("Domain ready to be built.\n");
rc = 0;
+ goto out;
+
+ finish_hvm:
+ /* Dump the QEMU state to a state file for QEMU to load */
+ if ( dump_qemu(dom, &tailbuf.u.hvm) ) {
+ ERROR("Error dumping QEMU state to file");
+ goto out;
+ }
+
+ /* These comms pages need to be zeroed at the start of day */
+ if ( xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[0]) ||
+ xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[1]) ||
+ xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[2]) )
+ {
+ ERROR("error zeroing magic pages");
+ goto out;
+ }
+
+ if ( (frc = xc_set_hvm_param(xc_handle, dom,
+ HVM_PARAM_IOREQ_PFN, tailbuf.u.hvm.magicpfns[0]))
+ || (frc = xc_set_hvm_param(xc_handle, dom,
+ HVM_PARAM_BUFIOREQ_PFN, tailbuf.u.hvm.magicpfns[1]))
+ || (frc = xc_set_hvm_param(xc_handle, dom,
+ HVM_PARAM_STORE_PFN, tailbuf.u.hvm.magicpfns[2]))
+ || (frc = xc_set_hvm_param(xc_handle, dom,
+ HVM_PARAM_PAE_ENABLED, pae))
+ || (frc = xc_set_hvm_param(xc_handle, dom,
+ HVM_PARAM_STORE_EVTCHN,
+ store_evtchn)) )
+ {
+ ERROR("error setting HVM params: %i", frc);
+ goto out;
+ }
+ *store_mfn = tailbuf.u.hvm.magicpfns[2];
+
+ frc = xc_domain_hvm_setcontext(xc_handle, dom, tailbuf.u.hvm.hvmbuf,
+ tailbuf.u.hvm.reclen);
+ if ( frc )
+ {
+ ERROR("error setting the HVM context");
+ goto out;
+ }
+
+ /* HVM success! */
+ rc = 0;
out:
if ( (rc != 0) && (dom != 0) )
@@ -1819,7 +2010,7 @@ int xc_domain_restore(int xc_handle, int
free(mmu);
free(p2m);
free(pfn_type);
- free(hvm_buf);
+ tailbuf_free(&tailbuf);
/* discard cache for save file */
discard_file_cache(io_fd, 1 /*flush*/);
diff -r 01c9cb566f61 -r f6091947ffed tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py Mon Nov 09 19:17:22 2009 +0000
+++ b/tools/python/xen/xend/XendCheckpoint.py Mon Nov 09 19:19:27 2009 +0000
@@ -323,26 +323,6 @@ def restore(xd, fd, dominfo = None, paus
if not is_hvm and handler.console_mfn is None:
raise XendError('Could not read console MFN')
- # get qemu state and create a tmp file for dm restore
- # Even PV guests may have QEMU stat, but its not currently
- # used so only bother with HVM currently.
- if is_hvm:
- qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
- "invalid device model signature read")
- if qemu_signature != QEMU_SIGNATURE:
- raise XendError("not a valid device model state: found '%s'" %
- qemu_signature)
- qemu_fd = os.open("/var/lib/xen/qemu-save.%d" % dominfo.getDomid(),
- os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
- while True:
- buf = os.read(fd, dm_batch)
- if len(buf):
- write_exact(qemu_fd, buf,
- "could not write dm state to tmp file")
- else:
- break
- os.close(qemu_fd)
-
restore_image.setCpuid()
# xc_restore will wait for source to close connection
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|