This is an updated version of the following patch. It has been reworked to
follow the changes made to the live migration code.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00374.html
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@xxxxxxxxxxxxx>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@xxxxxxxxxxxxx>
---
tools/libxc/xc_dom_kemari_save.c | 1139 +++++++++++++++++++++++++++++++++++++++
tools/xcutils/xc_kemari_save.c | 518 +++++++++++++++++
2 files changed, 1657 insertions(+)
diff -r b249f3e979a5 -r cf6a910e3663 tools/xcutils/xc_kemari_save.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xcutils/xc_kemari_save.c Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,518 @@
+/*
+ * xc_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License. See the file "COPYING" in the main directory of
+ * this archive for more details.
+ *
+ * This source code is based on xc_save.c.
+ * Copied qemu_destroy_buffer and init_qemu_maps from xc_save.c.
+ *
+ * Copyright (C) 2005 by Christian Limpach
+ *
+ */
+
+
+#include <err.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <xs.h>
+#include <xenctrl.h>
+#include <xenguest.h>
+#include <xc_private.h>
+#include <xen/kemari.h>
+
+static volatile sig_atomic_t run = 1; /* main-loop flag; close_handler() clears it */
+static int xc_handle, xce_handle, io_fd; /* libxc control handle, event-channel handle, fd parsed from argv[1] */
+static struct kemari_ring *ring = NULL; /* mapped in main(), unmapped in finalize() */
+static uint32_t kemari_ring_size = 0; /* filled in by XEN_KEMARI_OP_enable; multiplied by PAGE_SIZE when mapping */
+static pid_t qemu_pid; /* result of get_qemu_pid(); target of the SIGUSR1/SIGSTOP signals */
+static int is_finalized = 0; /* set at the end of finalize() so a second call is a no-op */
+static int domid; /* domain id parsed from argv[2] */
+
+/* For HVM guests, there are two sources of dirty pages: the Xen shadow
+ * log-dirty bitmap, which we get with a hypercall, and qemu's version.
+ * The protocol for getting page-dirtying data from qemu uses a
+ * double-buffered shared memory interface directly between xc_save and
+ * qemu-dm.
+ *
+ * xc_save calculates the size of the bitmaps and notifies qemu-dm
+ * through the store that it wants to share the bitmaps. qemu-dm then
+ * starts filling in the 'active' buffer.
+ *
+ * To change the buffers over, xc_save writes the other buffer number to
+ * the store and waits for qemu to acknowledge that it is now writing to
+ * the new active buffer. xc_save can then process and clear the old
+ * active buffer. */
+
+static char *qemu_active_path;
+static char *qemu_next_active_path;
+static int qemu_shmid = -1;
+static struct xs_handle *xs;
+
+
+/* atexit() hook: mark the qemu shared-memory segment for removal and
+ * forget its id so the request is not issued twice. */
+static void qemu_destroy_buffer(void)
+{
+    if (qemu_shmid == -1)
+        return;
+
+    shmctl(qemu_shmid, IPC_RMID, NULL);
+    qemu_shmid = -1;
+}
+
+/* Control bytes exchanged with qemu-dm; init_qemu_maps() points this just
+ * past the two bitmaps inside the shared-memory segment. */
+static char *kemari_qemu_info = NULL;
+
+/* Store the requested buffer number in byte 0, clear byte 1, then send
+ * SIGUSR1 to qemu-dm. */
+static void qemu_save_image(int next_active)
+{
+    char *info = kemari_qemu_info;
+
+    info[0] = next_active;
+    info[1] = 0;
+    xen_wmb();  /* order the two stores ahead of the signal */
+    kill(qemu_pid, SIGUSR1);
+}
+
+/* Busy-wait until byte 1 of the control area becomes non-zero. */
+static void qemu_end_flip(void)
+{
+    for (;;) {
+        if (kemari_qemu_info[1] != 0)
+            break;
+        xen_rmb();
+    }
+}
+
+/* Busy-wait until byte 2 of the control area becomes non-zero. */
+static void qemu_end_save(void)
+{
+    for (;;) {
+        if (kemari_qemu_info[2] != 0)
+            break;
+        xen_rmb();
+    }
+}
+
+/* Called after the QEMU image has been sent: reset byte 2 of the control
+ * area and issue a write barrier. */
+static void qemu_image_sent(void)
+{
+    char *info = kemari_qemu_info;
+
+    info[2] = 0;
+    xen_wmb();
+}
+
+/*
+ * Create and map the shared-memory segment used to exchange log-dirty
+ * bitmaps with qemu-dm, publish its key in xenstore, and kick qemu with
+ * buffer 0.
+ *
+ * The segment holds two bitmaps of bitmap_size bytes each followed by one
+ * extra page; kemari_qemu_info is pointed at that trailing page.
+ *
+ * Returns the mapped segment.  Any failure terminates the process via errx().
+ */
+static void *init_qemu_maps(int domid, unsigned int bitmap_size)
+{
+    key_t key;
+    char key_ascii[17] = {0,};
+    void *seg;
+    char *path, *leaf;
+
+    /* Make a shared-memory segment; retry with a new key on collision */
+    for (;;) {
+        key = rand(); /* No security, just a sequence of numbers */
+        qemu_shmid = shmget(key, 2 * bitmap_size + PAGE_SIZE,
+                            IPC_CREAT|IPC_EXCL|S_IRUSR|S_IWUSR);
+        if (qemu_shmid != -1)
+            break;
+        if (errno != EEXIST)
+            errx(1, "can't get shmem to talk to qemu-dm");
+    }
+
+    /* Remember to tidy up after ourselves */
+    atexit(qemu_destroy_buffer);
+
+    /* Map it into our address space and start from all-zero contents */
+    seg = shmat(qemu_shmid, NULL, 0);
+    if (seg == (void *) -1)
+        errx(1, "can't map shmem to talk to qemu-dm");
+    memset(seg, 0, 2 * bitmap_size + PAGE_SIZE);
+
+    /* Write the size of it into the first 32 bits */
+    *(uint32_t *)seg = bitmap_size;
+
+    /* Tell qemu about it */
+    xs = xs_daemon_open();
+    if (xs == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    path = strdup("/local/domain/0/device-model/");
+    if (path == NULL)
+        errx(1, "can't get domain path in store");
+
+    /* Room for up to 10 characters of domid plus the longest leaf name */
+    path = realloc(path, strlen(path)
+                         + 10
+                         + strlen("/logdirty/next-active") + 1);
+    if (path == NULL)
+        errx(1, "no memory for constructing xenstore path");
+
+    snprintf(path + strlen(path), 11, "%i", domid);
+    strcat(path, "/logdirty/");
+    leaf = path + strlen(path);  /* leaf names are rewritten in place here */
+
+    strcpy(leaf, "key");
+    snprintf(key_ascii, 17, "%16.16llx", (unsigned long long) key);
+    if (!xs_write(xs, XBT_NULL, path, key_ascii, 16))
+        errx(1, "can't write key (%s) to store path (%s)\n", key_ascii, path);
+
+    /* Watch for qemu's indication of the active buffer, and request it
+     * to start writing to buffer 0 */
+    strcpy(leaf, "active");
+    if (!xs_watch(xs, path, "qemu-active-buffer"))
+        errx(1, "can't set watch in store (%s)\n", path);
+    qemu_active_path = strdup(path);
+    if (qemu_active_path == NULL)
+        errx(1, "no memory for copying xenstore path");
+
+    strcpy(leaf, "next-active");
+    qemu_next_active_path = strdup(path);
+    if (qemu_next_active_path == NULL)
+        errx(1, "no memory for copying xenstore path");
+
+    /* The control bytes live right after the two bitmaps */
+    kemari_qemu_info = (char *)seg + 2 * bitmap_size;
+    xen_wmb();
+    qemu_save_image(0);
+
+    free(path);
+    return seg;
+}
+
+/* Signal handler: ask the main loop to stop. */
+static void close_handler(int sig_type)
+{
+    (void)sig_type;  /* the signal number itself is not needed */
+    run = 0;
+}
+
+/*
+ * Service one pending event-channel notification: fetch the pending port,
+ * run xc_kemari_update(), then unmask the port again.
+ *
+ * If xc_kemari_update() fails the domain is paused and qemu is stopped
+ * before returning.  Returns 0 on success, 1 on any failure.
+ */
+static int handle_event(int domid, unsigned int flags)
+{
+    int pending_port = xc_evtchn_pending(xce_handle);
+
+    if (pending_port < 0) {
+        ERROR("Failed to read from event fd");
+        return 1;
+    }
+
+    if (xc_kemari_update(xc_handle, io_fd, domid, ring, flags,
+                         qemu_save_image, qemu_end_flip, qemu_end_save,
+                         qemu_image_sent) != 0) {
+        xc_domain_pause(xc_handle, domid);
+        kill(qemu_pid, SIGSTOP);
+        ERROR("xc_kemari_update failed");
+        return 1;
+    }
+
+    if (xc_evtchn_unmask(xce_handle, pending_port) < 0) {
+        ERROR("Failed to write to event fd");
+        return 1;
+    }
+
+    return 0;
+}
+
+/* Install `handler` for SIGQUIT, SIGINT, SIGHUP and SIGTERM, with an
+ * empty signal mask and no flags. */
+static void set_signal_handler(void (*handler)(int))
+{
+    static const int signals[] = { SIGQUIT, SIGINT, SIGHUP, SIGTERM };
+    struct sigaction act;
+    size_t idx;
+
+    act.sa_handler = handler;
+    sigemptyset(&act.sa_mask);
+    act.sa_flags = 0;
+
+    for (idx = 0; idx < sizeof(signals) / sizeof(signals[0]); idx++)
+        sigaction(signals[idx], &act, 0);
+}
+
+/*
+ * Attach to Kemari the event channel of every device listed under
+ * /local/domain/<domid>/device/<dev_class> in xenstore.
+ *
+ * Devices without an "event-channel" node are skipped.  `label` is only
+ * used in log messages.  Terminates the process via errx() if the device
+ * directory cannot be listed.
+ *
+ * Returns 0 on success, or the non-zero result of the first failing
+ * xc_kemari_control() call.  All xenstore buffers are released on every
+ * path.
+ */
+static int attach_device_ports(struct xs_handle *xs_handle, int domid,
+                               const char *dev_class, const char *label)
+{
+    char path[128];
+    char **list, *data;
+    unsigned int list_size, data_size, i;
+    uint32_t port;
+    int ret = 0;
+
+    snprintf(path, sizeof(path), "/local/domain/%d/device/%s",
+             domid, dev_class);
+    list = xs_directory(xs_handle, XBT_NULL, path, &list_size);
+    if (list == NULL)
+        errx(1, "xs_directory (%s) failed", path);
+
+    for (i = 0; i < list_size; i++) {
+        snprintf(path, sizeof(path),
+                 "/local/domain/%d/device/%s/%s/event-channel",
+                 domid, dev_class, list[i]);
+        data = xs_read(xs_handle, XBT_NULL, path, &data_size);
+        if (data == NULL)
+            continue;
+        port = strtoul(data, NULL, 10);
+        free(data);  /* released before the attach so no path can leak it */
+
+        ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach,
+                                &port, NULL, NULL, KEMARI_TAP_OUT);
+        if (ret != 0) {
+            ERROR("Error when attaching %s (%d) on kemari", label, port);
+            break;
+        }
+        DPRINTF("%s %d attached\n", label, port);
+    }
+
+    free(list);
+    return ret;
+}
+
+/*
+ * Attach the event channels of all block (vbd) and network (vif) devices
+ * of `domid` to Kemari.
+ *
+ * Returns 0 when every port was attached, non-zero otherwise.
+ */
+static int attach_ports(int domid)
+{
+    struct xs_handle *xs_handle;
+    int ret;
+
+    if ((xs_handle = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    ret = attach_device_ports(xs_handle, domid, "vbd", "blk_port");
+    if (ret == 0)
+        ret = attach_device_ports(xs_handle, domid, "vif", "net_port");
+
+    xs_daemon_close(xs_handle);
+
+    return ret;
+}
+
+/*
+ * Read the device-model pid of `domid` from
+ * /local/domain/<domid>/image/device-model-pid in xenstore.
+ *
+ * Returns the pid, or 0 if the node could not be read.
+ */
+static pid_t get_qemu_pid(int domid)
+{
+    struct xs_handle *xs_handle;
+    char path[128];
+    char *value;
+    unsigned int value_len;
+    pid_t pid = 0;
+
+    xs_handle = xs_daemon_open();
+    if (xs_handle == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    snprintf(path, sizeof(path),
+             "/local/domain/%d/image/device-model-pid", domid);
+    value = xs_read(xs_handle, XBT_NULL, path, &value_len);
+    if (value != NULL) {
+        pid = strtoul(value, NULL, 10);
+        free(value);
+    } else {
+        ERROR("Could not find QEMU pid for domid %d", domid);
+    }
+
+    xs_daemon_close(xs_handle);
+
+    return pid;
+}
+
+/*
+ * Tear-down path, registered with atexit() and also called explicitly from
+ * main(); is_finalized makes the second invocation a no-op.
+ *
+ * Ignores further termination signals, unmaps the ring, switches Kemari
+ * and shadow mode off, destroys the domain if the run flag was cleared,
+ * and closes the control interface.
+ */
+static void finalize(void)
+{
+    if (is_finalized)
+        return;
+
+    set_signal_handler(SIG_IGN);
+
+    if (ring != NULL)
+        munmap(ring, kemari_ring_size * PAGE_SIZE);
+
+    if (xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_off,
+                          NULL, NULL, NULL, 0) == 0) {
+        DPRINTF("successufully execute KEMARI_OP_off\n");
+    } else {
+        ERROR("Error when turning off kemari");
+    }
+
+    if (xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                          NULL, 0, NULL, 0, NULL) < 0) {
+        ERROR("Warning - couldn't disable shadow mode");
+    }
+
+    if (run == 0)
+        xc_domain_destroy(xc_handle, domid);
+
+    xc_interface_close(xc_handle);
+
+    is_finalized = 1;
+}
+
+/*
+ * Usage: <prog> iofd domid maxit maxf flags
+ *
+ * Enables log-dirty mode and Kemari for the domain, attaches its device
+ * event channels, sends the initial image with xc_kemari_save(), and then
+ * services Kemari events until a termination signal clears `run` or an
+ * error occurs.
+ *
+ * An iofd of -1 selects test mode: output goes to /dev/null and
+ * XCFLAGS_DEBUG is set.
+ *
+ * Exit status is 0 only when the event loop ended because of a signal;
+ * every failure path yields a non-zero status.
+ */
+int
+main(int argc, char **argv)
+{
+    unsigned int flags;
+    int ret = 1;    /* failure unless a later step explicitly succeeds */
+    int evtchn_fd;
+    int port;       /* signed, so a negative bind result can be detected */
+    uint32_t kemari_port;
+    uint64_t kemari_mfn;
+    fd_set inset;
+
+    if (argc != 6)
+        errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
+
+    xc_handle = xc_interface_open();
+    if (xc_handle < 0)
+        errx(1, "failed to open control interface");
+
+    /* argv[3] (maxit) and argv[4] (maxf) are accepted but not used here */
+    io_fd = atoi(argv[1]);
+    domid = atoi(argv[2]);
+    flags = atoi(argv[5]);
+
+    set_signal_handler(close_handler);
+    if ((qemu_pid = get_qemu_pid(domid)) == 0)
+        errx(1, "failed to get qemu pid");
+    atexit(finalize);
+
+    if (io_fd == -1) /* means test mode */
+    {
+        io_fd = open("/dev/null", O_RDWR);
+        if (io_fd < 0)
+            err(1, "failed to open /dev/null");
+        flags |= XCFLAGS_DEBUG;
+    }
+    else
+    {
+        int one = 1;
+        if (setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY,
+                       &one, sizeof(one)) < 0) {
+            ERROR("failed to set TCP_NODELAY");
+        }
+    }
+
+    if ((xce_handle = xc_evtchn_open()) < 0) {
+        errx(1, "failed to open control interface");
+    }
+
+    evtchn_fd = xc_evtchn_fd(xce_handle);
+
+    if ( xc_shadow_control(xc_handle, domid,
+                           XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                           NULL, 0, NULL, 0, NULL) < 0 )
+    {
+        int frc;
+        /* log-dirty already enabled? There's no test op,
+           so attempt to disable then reenable it */
+        frc = xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                                NULL, 0, NULL, 0, NULL);
+        if ( frc >= 0 )
+        {
+            frc = xc_shadow_control(xc_handle, domid,
+                                    XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                                    NULL, 0, NULL, 0, NULL);
+        }
+
+        if ( frc < 0 )
+        {
+            err(errno, "Couldn't enable shadow mode (rc %d)", frc);
+        }
+    }
+
+    if (xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_enable,
+                          &kemari_port, &kemari_ring_size,
+                          &kemari_mfn, 0) != 0) {
+        errx(1, "Error when turning on kemari");
+    }
+
+    DPRINTF("kemari_port=%u, kemari_mfn=%llu, kemari_ring_size=%u\n",
+            kemari_port, (unsigned long long)kemari_mfn, kemari_ring_size);
+
+    if (attach_ports(domid) != 0) {
+        ERROR("attaching port failed ");
+        goto out;
+    }
+
+    port = xc_evtchn_bind_interdomain(xce_handle, domid, kemari_port);
+    if (port < 0) {
+        ERROR("xc_evtchn_bind_interdomain failed ");
+        goto out;
+    }
+
+    ring = xc_map_foreign_range(xc_handle, DOMID_XEN,
+                                kemari_ring_size * PAGE_SIZE,
+                                PROT_READ | PROT_WRITE,
+                                kemari_mfn);
+    if (ring == NULL) {
+        ERROR("xc_map_foreign_range failed");
+        goto out;
+    }
+
+    if (xc_domain_pause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have paused");
+        goto out;
+    }
+
+    ret = xc_kemari_save(xc_handle, io_fd, domid, ring, flags,
+                         !!(flags & XCFLAGS_HVM),
+                         &init_qemu_maps);
+    if (ret != 0) {
+        ERROR("xc_kemari_save failed");
+        goto out;
+    }
+
+    if (xc_domain_unpause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have unpaused");
+        ret = 1;
+        goto out;
+    }
+
+    DPRINTF("running start");
+
+    while (run) {
+        /* select() rewrites the set on return, so rebuild it every pass */
+        FD_ZERO(&inset);
+        FD_SET(evtchn_fd, &inset);
+
+        if (select(evtchn_fd + 1, &inset, NULL, NULL, NULL) < 0) {
+            if (errno == EINTR)
+                continue;   /* a signal arrived; re-check `run` */
+            ERROR("Error when waiting events by select()");
+            ret = 1;
+            break;
+        }
+
+        if (!FD_ISSET(evtchn_fd, &inset))
+            continue;
+
+        if ((ret = handle_event(domid, flags)) != 0) {
+            ERROR("Error when handling events");
+            break;
+        }
+
+        if (xc_evtchn_notify(xce_handle, port) < 0) {
+            ERROR("xc_evtchn_notify failed");
+            ret = 1;
+            break;
+        }
+
+        if (xc_domain_unpause(xc_handle, domid) < 0) {
+            ERROR("xc_domain_unpause");
+            ret = 1;
+            break;
+        }
+    }
+
+ out:
+    close(io_fd);
+    finalize();
+
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_dom_kemari_save.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_kemari_save.c Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,1139 @@
+/******************************************************************************
+ * xc_dom_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * This source code is based on xc_domain_save.c.
+ * Copied BITS_PER_LONG, BITS_TO_LONGS, BITMAP_SIZE, BITMAP_SHIFT,
+ * RATE_IS_MAX, test_bit, clear_bit, set_bit, tv_delta, noncached_write,
+ * initialize_mbit_rate, and ratewrite from xc_domain_save.c
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include "xc_e820.h"
+
+#ifdef __MINIOS__
+/*
+ * Caution: the following replacement libc functions are not atomic.
+ */
+/*
+ * Minimal sendfile() replacement for MiniOS: copy up to `count` bytes from
+ * in_fd to out_fd through a stack buffer.
+ *
+ * `offset` must be NULL; otherwise errno is set to EIO and -1 is returned.
+ * Returns the number of bytes copied, which is less than `count` if in_fd
+ * reaches end-of-file first, or -1 on a read or write error.
+ */
+static ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+    char buf[1024];
+    ssize_t len, wrote_len = 0;
+
+    if (offset != NULL) {
+        ERROR("Sorry sendfile for stubdomain should not have offset");
+        errno = EIO;
+        return -1;
+    }
+
+    while (count > 0) {
+        size_t want = (count < sizeof(buf)) ? count : sizeof(buf);
+
+        len = read(in_fd, buf, want);
+        if (len < 0)
+            return -1;
+        if (len == 0)
+            break;  /* EOF: stop instead of spinning on a zero-length read */
+        if (write_exact(out_fd, buf, len))
+            return -1;
+        wrote_len += len;
+        count -= len;
+    }
+    return wrote_len;
+}
+
+#define IOV_MAX 1024
+struct iovec {
+ void *iov_base; /* Base address. */
+ size_t iov_len; /* Length. */
+};
+/*
+ * Minimal writev() replacement for MiniOS: write each iovec element in
+ * turn with write().
+ *
+ * Returns the total number of bytes written; stops early and returns the
+ * running total after a short write.  Returns -1 if a write fails, and -1
+ * with errno = EINVAL if iovcnt is out of range or the total would exceed
+ * INT_MAX.
+ */
+static ssize_t writev(int d, const struct iovec *iov, int iovcnt)
+{
+    int i;
+    int wrote_len = 0;
+    ssize_t len;
+
+    if (iovcnt < 0 || iovcnt > IOV_MAX) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    for (i = 0; i < iovcnt; i++) {
+        len = write(d, iov[i].iov_base, iov[i].iov_len);
+        if (len < 0)
+            return -1;
+
+        /* test before adding: signed overflow itself is undefined */
+        if (len > INT_MAX - wrote_len) {
+            errno = EINVAL;
+            return -1;
+        }
+        wrote_len += len;
+
+        if ((size_t)len != iov[i].iov_len)
+            return wrote_len;
+    }
+
+    return wrote_len;
+}
+#else /* !__MINIOS__ */
+#include <sys/sendfile.h>
+#include <sys/uio.h>
+#endif /* __MINIOS__ */
+
+/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* page frame numbers */
+static unsigned long *pfn_type = NULL;
+
+/* The new domain's shared-info frame number. */
+static unsigned long shared_info_frame;
+
+/*
+ * guest memory
+ */
+#define GUEST_MEM_ENTRY_SIZE 1024 /* up to 4MB at a time. */
+static unsigned char ** guest_memory = NULL;
+static unsigned long ** guest_memory_status = NULL;
+static unsigned long guest_memory_size = 0;
+
+/*
+ * Map batch `base` of guest memory (GUEST_MEM_ENTRY_SIZE consecutive
+ * frames starting at base * GUEST_MEM_ENTRY_SIZE) read-only and record the
+ * per-frame type bits returned by xc_map_foreign_batch() in
+ * guest_memory_status[base].
+ *
+ * Returns 0 with guest_memory[base] set to the mapping, 1 if every frame
+ * in the batch came back as XEN_DOMCTL_PFINFO_XTAB (the mapping is dropped
+ * and guest_memory[base] set to NULL), or -1 if the mapping call failed.
+ */
+static inline int map_guest_mem(int xc_handle, uint32_t domid,
+                                unsigned long base)
+{
+    int j;
+    int all_xtab = 1;
+    unsigned char *region_base;
+    unsigned long *pfn_base = guest_memory_status[base];
+
+    /* Every slot is assigned here, so no separate clearing pass is needed */
+    for (j = 0; j < GUEST_MEM_ENTRY_SIZE; j++)
+        pfn_base[j] = base * GUEST_MEM_ENTRY_SIZE + j;
+
+    region_base = xc_map_foreign_batch(
+        xc_handle, domid, PROT_READ, pfn_base, GUEST_MEM_ENTRY_SIZE);
+    if ( region_base == NULL )
+    {
+        PERROR("map failed at guest memory frame 0x%lx - 0x%lx (%lu)",
+               base * GUEST_MEM_ENTRY_SIZE,
+               (base + 1)* GUEST_MEM_ENTRY_SIZE - 1,
+               base);
+        return -1;
+    }
+
+    /* Keep only the type bits and look for completely empty batches */
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+    {
+        pfn_base[j] &= XEN_DOMCTL_PFINFO_LTAB_MASK;
+        if ( pfn_base[j] != XEN_DOMCTL_PFINFO_XTAB )
+            all_xtab = 0;
+    }
+
+    if ( all_xtab )
+    {
+        munmap(region_base, GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+        guest_memory[base] = NULL;
+        return 1;
+    }
+
+    guest_memory[base] = region_base;
+
+    return 0;
+}
+
+/*
+ * Return a pointer to the mapped contents of guest frame `mfn`, mapping or
+ * remapping the batch that contains it when necessary.
+ *
+ * A frame whose recorded status is XEN_DOMCTL_PFINFO_XTAB forces its batch
+ * to be remapped.  Returns NULL if the frame lies outside the tracked
+ * range or the batch cannot be mapped.
+ *
+ * NOTE: apart from the XTAB test, the type bits selected by
+ * XEN_DOMCTL_PFINFO_LTAB_MASK are not examined here.
+ */
+static inline unsigned char * search_guest_mem(int xc_handle, uint32_t domid,
+                                               unsigned long mfn)
+{
+    unsigned long base = mfn / GUEST_MEM_ENTRY_SIZE;
+    unsigned long offset = mfn % GUEST_MEM_ENTRY_SIZE;
+
+    if (base >= guest_memory_size) {
+        ERROR("Error base(%lu) is greater than guest_memory_size(%lu)\n",
+              base, guest_memory_size);
+        return NULL;
+    }
+
+    if ( guest_memory_status[base][offset] == XEN_DOMCTL_PFINFO_XTAB ) {
+        /* reload XTAB place; the batch may not be mapped at all */
+        if (guest_memory[base] != NULL) {
+            munmap(guest_memory[base], GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+            guest_memory[base] = NULL;
+        }
+        DPRINTF("guest_memory[%lu] (frame 0x%lx - 0x%lx) will be remapped\n",
+                base, base * GUEST_MEM_ENTRY_SIZE,
+                (base + 1) * GUEST_MEM_ENTRY_SIZE - 1);
+    }
+
+    /* map_guest_mem() takes the batch index, not the offset inside it */
+    if (guest_memory[base] == NULL)
+        if (map_guest_mem(xc_handle, domid, base))
+            return NULL;
+
+    return guest_memory[base] + offset * PAGE_SIZE;
+}
+
+/*
+ * Allocate and lock the guest_memory / guest_memory_status tables for a
+ * guest of p2m_size frames, then map every batch with map_guest_mem().
+ *
+ * Returns 0 on success and -1 on allocation, locking or mapping failure.
+ * Nothing allocated here is released on the failure paths.
+ */
+static inline int init_guest_mem(int xc_handle, uint32_t dom)
+{
+    unsigned long i;
+    const size_t entry_len =
+        GUEST_MEM_ENTRY_SIZE * sizeof(guest_memory_status[0][0]);
+
+    guest_memory_size = p2m_size / GUEST_MEM_ENTRY_SIZE + 1;
+    DPRINTF("guest_memory_size: %lu\n", guest_memory_size);
+
+    /* mapped memory */
+    guest_memory = xg_memalign(PAGE_SIZE,
+                               guest_memory_size * sizeof(guest_memory[0]));
+    if (guest_memory == NULL)
+    {
+        PERROR("failed to allocate guest_memory");
+        return -1;
+    }
+    if ( lock_pages(guest_memory, guest_memory_size * sizeof(guest_memory[0])))
+    {
+        ERROR("Unable to lock guest_memory array");
+        return -1;
+    }
+
+    /* memory status */
+    guest_memory_status = xg_memalign(PAGE_SIZE,
+        guest_memory_size * sizeof(guest_memory_status[0]));
+    if ( guest_memory_status == NULL )
+    {
+        ERROR("failed to alloc memory for guest_memory_status");
+        errno = ENOMEM;
+        return -1;
+    }
+    if ( lock_pages(guest_memory_status,
+                    guest_memory_size * sizeof(guest_memory_status[0])))
+    {
+        ERROR("Unable to lock guest_memory_status array");
+        return -1;
+    }
+
+    for (i = 0; i < guest_memory_size; i++) {
+        guest_memory_status[i] = xg_memalign(PAGE_SIZE, entry_len);
+        if (guest_memory_status[i] == NULL) {
+            ERROR("failed to alloc memory for guest_memory_status[%d]",
+                  (int)i);
+            errno = ENOMEM;
+            return -1;
+        }
+        /* lock the entry just allocated, using that entry's own length */
+        if ( lock_pages(guest_memory_status[i], entry_len) )
+        {
+            ERROR("Unable to lock guest_memory_status[%d]", (int)i);
+            return -1;
+        }
+    }
+
+    for (i = 0; i < guest_memory_size; i++)
+        if (map_guest_mem(xc_handle, dom, i) < 0)
+            return -1;
+
+    return 0;
+}
+
+/* Issue a single writev() and report success (0) only if it wrote the
+ * combined length of all `count` elements; otherwise return -1. */
+static int writev_exact(int fd, const struct iovec *iov, size_t count)
+{
+    size_t expected = 0;
+    size_t idx;
+
+    for (idx = 0; idx < count; idx++)
+        expected += iov[idx].iov_len;
+
+    return (writev(fd, iov, count) != expected) ? -1 : 0;
+}
+
+/* grep fodder: machine_to_phys */
+
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+#define BITMAP_SIZE (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+ ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+/* Return 1 if bit `nr` of the bitmap at `addr` is set, 0 otherwise. */
+static inline int test_bit (int nr, volatile void * addr)
+{
+    unsigned long word = BITMAP_ENTRY(nr, addr);
+
+    return (word >> BITMAP_SHIFT(nr)) & 1;
+}
+
+/* Clear bit `nr` of the bitmap at `addr`. */
+static inline void clear_bit (int nr, volatile void * addr)
+{
+    unsigned long mask = 1UL << BITMAP_SHIFT(nr);
+
+    BITMAP_ENTRY(nr, addr) &= ~mask;
+}
+
+/* Set bit `nr` of the bitmap at `addr`. */
+static inline void set_bit ( int nr, volatile void * addr)
+{
+    unsigned long mask = 1UL << BITMAP_SHIFT(nr);
+
+    BITMAP_ENTRY(nr, addr) |= mask;
+}
+
+/* Difference new - old, expressed in microseconds. */
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{
+    time_t secs = new->tv_sec - old->tv_sec;
+    suseconds_t usecs = new->tv_usec - old->tv_usec;
+
+    return (secs * 1000000) + usecs;
+}
+
+/*
+ * Write `len` bytes with write_exact() and, once the running byte count
+ * reaches MAX_PAGECACHE_USAGE pages, ask for the file cache of `fd` to be
+ * discarded and restart the count.
+ *
+ * Returns `len` on success and -1 if write_exact() failed; the byte count
+ * is advanced in both cases.
+ */
+static int noncached_write(int fd, void *buffer, int len)
+{
+    static int write_count = 0;
+    int rc;
+
+    if (write_exact(fd, buffer, len) == 0)
+        rc = len;
+    else
+        rc = -1;
+
+    write_count += len;
+    if ( write_count < (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
+        return rc;
+
+    /* Time to discard cache - dont care if this fails */
+    discard_file_cache(fd, 0 /* no flush */);
+    write_count = 0;
+
+    return rc;
+}
+
+#ifdef ADAPTIVE_SAVE
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate).
+*/
+
+#define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
+#define START_MBIT_RATE 100 /* initial transmit rate for migrate */
+
+/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
+#define RATE_TO_BTU 781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+/* We keep track of the current and previous transmission rate */
+static int mbit_rate, ombit_rate = 0;
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
+
+static inline void initialize_mbit_rate() /* reset the current transmit rate to START_MBIT_RATE */
+{
+ mbit_rate = START_MBIT_RATE;
+}
+
+static int ratewrite(int io_fd, void *buf, int n) /* rate-limited front end to noncached_write(); returns its result */
+{
+ static int budget = 0; /* bytes that may still be sent in the current burst */
+ static int burst_time_us = -1; /* length of one burst slot, derived from mbit_rate */
+ static struct timeval last_put = { 0 }; /* start of the current slot; tv_sec == 0 means "not started yet" */
+ struct timeval now;
+ struct timespec delay;
+ long long delta;
+
+ if ( START_MBIT_RATE == 0 ) /* a zero start rate disables throttling */
+ return noncached_write(io_fd, buf, n);
+
+ budget -= n; /* charge this write against the budget */
+ if ( budget < 0 )
+ {
+ if ( mbit_rate != ombit_rate ) /* rate changed since last time: recompute the slot length */
+ {
+ burst_time_us = RATE_TO_BTU / mbit_rate;
+ ombit_rate = mbit_rate;
+ DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
+ mbit_rate, BURST_BUDGET, burst_time_us);
+ }
+ if ( last_put.tv_sec == 0 ) /* first time the budget runs out: grant one burst and start the clock */
+ {
+ budget += BURST_BUDGET;
+ gettimeofday(&last_put, NULL);
+ }
+ else
+ {
+ while ( budget < 0 )
+ {
+ gettimeofday(&now, NULL);
+ delta = tv_delta(&now, &last_put);
+ while ( delta > burst_time_us ) /* credit one burst per fully elapsed slot */
+ {
+ budget += BURST_BUDGET;
+ last_put.tv_usec += burst_time_us;
+ if ( last_put.tv_usec > 1000000 ) /* NOTE(review): '>' leaves tv_usec == 1000000 un-normalised; '>=' may have been intended - confirm */
+ {
+ last_put.tv_usec -= 1000000;
+ last_put.tv_sec++;
+ }
+ delta -= burst_time_us;
+ }
+ if ( budget > 0 )
+ break;
+ delay.tv_sec = 0;
+ delay.tv_nsec = 1000 * (burst_time_us - delta); /* remainder of the current slot, in ns */
+ while ( delay.tv_nsec > 0 )
+ if ( nanosleep(&delay, &delay) == 0 ) /* on interruption the unslept time is written back and retried */
+ break;
+ }
+ }
+ }
+ return noncached_write(io_fd, buf, n);
+}
+
+#else /* ! ADAPTIVE SAVE */
+
+#define RATE_IS_MAX() (0)
+#define ratewrite(_io_fd, _buf, _n) noncached_write((_io_fd), (_buf), (_n))
+#define initialize_mbit_rate()
+
+#endif
+
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+ xc_shadow_op_stats_t *stats, int print)
+{ /* logs CPU and throughput figures since the previous call when `print` is set; always returns 0 */
+ static struct timeval wall_last; /* wall-clock time of the previous call */
+ static long long d0_cpu_last; /* domain 0 CPU reading at the previous call */
+ static long long d1_cpu_last; /* target domain CPU reading at the previous call */
+
+ struct timeval wall_now;
+ long long wall_delta;
+ long long d0_cpu_now, d0_cpu_delta;
+ long long d1_cpu_now, d1_cpu_delta;
+
+ gettimeofday(&wall_now, NULL);
+
+ d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
+ d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
+
+ if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) /* NOTE(review): values were already divided by 1000, so a raw -1 would read as 0 here - confirm intent */
+ DPRINTF("ARRHHH!!\n");
+
+ wall_delta = tv_delta(&wall_now,&wall_last)/1000; /* microseconds -> milliseconds */
+ if ( wall_delta == 0 )
+ wall_delta = 1; /* keeps the divisions below away from zero */
+
+ d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+ d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+ if ( print )
+ DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+ "dirtied %dMb/s %" PRId32 " pages\n",
+ wall_delta,
+ (int)((d0_cpu_delta*100)/wall_delta),
+ (int)((d1_cpu_delta*100)/wall_delta),
+ (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+ (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+ stats->dirty_count);
+
+#ifdef ADAPTIVE_SAVE
+ if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate ) /* dirtying outpaces the current rate */
+ {
+ mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
+ + 50; /* raise the rate to the dirty rate plus 50 ... */
+ if ( mbit_rate > MAX_MBIT_RATE )
+ mbit_rate = MAX_MBIT_RATE; /* ... capped at MAX_MBIT_RATE */
+ }
+#endif
+
+ d0_cpu_last = d0_cpu_now; /* remember this sample for the next call */
+ d1_cpu_last = d1_cpu_now;
+ wall_last = wall_now;
+
+ return 0;
+}
+
+/*
+ * Send the qemu device-model image stored in /dev/shm/qemu-save.<dom> over
+ * io_fd: first a chunk header carrying the marker value and the image
+ * size, then the file contents via sendfile().
+ *
+ * Returns 0 on success and -1 on failure.  The image file descriptor is
+ * closed on every path.
+ */
+static int send_qemu_image(int xc_handle, int io_fd, uint32_t dom)
+{
+    char path[128];
+    struct stat st;
+    struct {
+        int minusfour;
+        uint32_t image_size;
+    } chunk = { -1, 0 };
+    int qemu_fd = -1;
+    int rc = -1;
+
+    snprintf(path, sizeof(path), "/dev/shm/qemu-save.%d", dom);
+    if ((qemu_fd = open(path, O_RDONLY)) == -1)
+    {
+        PERROR("Error when opening qemu image %s", path);
+        goto out;
+    }
+
+    if (fstat(qemu_fd, &st) == -1)
+    {
+        PERROR("Error fstat qemu file %s", path);
+        goto out;
+    }
+    chunk.image_size = st.st_size;
+
+    if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing header for qemu image");
+        goto out;
+    }
+
+    if ( sendfile(io_fd, qemu_fd, NULL, chunk.image_size) !=
+         chunk.image_size)
+    {
+        PERROR("Error when writing qemu image");
+        goto out;
+    }
+
+    rc = 0;
+out:
+    /* single exit point so the descriptor is released on errors too */
+    if (qemu_fd != -1)
+        close(qemu_fd);
+    return rc;
+}
+
+/*
+ * Send the HVM_PARAM_IDENT_PT (chunk id -3) and HVM_PARAM_VM86_TSS
+ * (chunk id -4) values over io_fd.  A chunk is only written when the
+ * value read for it is non-zero.
+ *
+ * Returns 0 on success and -1 if a write fails.
+ */
+static int send_hvm_params(int xc_handle, int io_fd, uint32_t dom)
+{
+    struct {
+        int id;
+        uint32_t pad;
+        uint64_t data;
+    } chunk = { 0, 0 };
+
+    chunk.id = -3;
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
+                     &chunk.data);
+
+    if ( (chunk.data != 0) &&
+         write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing the ident_pt for EPT guest");
+        return -1;
+    }
+
+    chunk.id = -4;
+    /* start from zero so a failed query cannot resend the previous value */
+    chunk.data = 0;
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
+                     &chunk.data);
+
+    if ( (chunk.data != 0) &&
+         write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing the vm86 TSS for guest");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Fetch the HVM context of `dom` into the buffer located at
+ * ring->hvm_ctxt.buf_offset inside the ring, then send its size (32 bits)
+ * followed by the context itself over io_fd.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int send_hvm_context(int xc_handle, int io_fd,
+                            struct kemari_ring *ring, uint32_t dom)
+{
+    uint32_t buf_size = ring->hvm_ctxt.buf_size;
+    uint8_t *hvm_buf = (uint8_t *)ring + ring->hvm_ctxt.buf_offset;
+    uint32_t rec_size;
+    int len;
+
+    /* Get HVM context from Xen and save it too; keep the result signed
+     * so any negative return is recognised as an error */
+    len = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, buf_size);
+    if ( len < 0 )
+    {
+        ERROR("HVM:Could not get hvm buffer");
+        return -1;
+    }
+    rec_size = len;
+
+    if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
+    {
+        PERROR("error write hvm buffer size");
+        return -1;
+    }
+
+    if ( write_exact(io_fd, hvm_buf, rec_size) )
+    {
+        PERROR("write HVM info failed!\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom,
+ void *kemari_ring, uint32_t flags,
+ int hvm, void *(*init_qemu_maps)(int, unsigned))
+{
+ int rc = 1, i, j, iter = 0;
+ int debug = (flags & XCFLAGS_DEBUG);
+ int sent_last_iter, skip_this_iter;
+ xc_dominfo_t info;
+ struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+
+ /* base of the region in which domain memory is mapped */
+ unsigned char *region_base = NULL;
+
+ /* bitmap of pages:
+ - that should be sent this iteration (unless later marked as skip);
+ - to skip this iteration because already dirty;
+ - to fixup by sending at the end if not already resent; */
+ unsigned long *to_send = NULL, *to_fix = NULL;
+
+ xc_shadow_op_stats_t stats;
+
+ unsigned long needed_to_fix = 0;
+ unsigned long total_sent = 0;
+
+ /* HVM: magic frames for ioreqs and xenstore comms. */
+ uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+ /* callback irq */
+ uint64_t callback_irq = 0;
+
+ if ( !hvm )
+ {
+ ERROR("HVM domain is required for the kemari migration.");
+ return 1;
+ }
+
+ initialize_mbit_rate();
+
+ if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
+ {
+ ERROR("Could not get domain info");
+ return 1;
+ }
+
+ shared_info_frame = info.shared_info_frame;
+ DPRINTF("xc_kemari_save: shared_info_frame: %lu\n", shared_info_frame);
+
+ /* Get the size of the P2M table */
+ p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
+ DPRINTF("xc_kemari_save: p2m_size: %lu\n", p2m_size);
+
+ /* Domain is still running at this point */
+ {
+ /* Get qemu-dm logging dirty pages too */
+ void *seg = init_qemu_maps(dom, BITMAP_SIZE);
+ qemu_bitmaps[0] = seg;
+ qemu_bitmaps[1] = seg + BITMAP_SIZE;
+ qemu_active = 0;
+ qemu_non_active = 1;
+ }
+
+ /* pretend we sent all the pages last iteration */
+ sent_last_iter = p2m_size;
+
+ /* Setup to_send / to_fix bitmaps */
+ to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
+ to_fix = calloc(1, BITMAP_SIZE);
+
+ if ( !to_send || !to_fix )
+ {
+ ERROR("Couldn't allocate to_send array");
+ goto out;
+ }
+
+ memset(to_send, 0xff, BITMAP_SIZE);
+
+ if ( lock_pages(to_send, BITMAP_SIZE) )
+ {
+ ERROR("Unable to lock to_send");
+ return 1;
+ }
+
+ pfn_type = xg_memalign(PAGE_SIZE, ROUNDUP(
+ MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+ if ( pfn_type == NULL )
+ {
+ ERROR("failed to alloc memory for pfn_type arrays");
+ errno = ENOMEM;
+ goto out;
+ }
+ memset(pfn_type, 0,
+ ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+
+ if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
+ {
+ ERROR("Unable to lock pfn_type array");
+ goto out;
+ }
+
+ /* Start writing out the saved-domain record. */
+ if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+ {
+ PERROR("write: p2m_size");
+ goto out;
+ }
+
+ /* send shared_info_frame */
+ if ( write_exact(io_fd, &shared_info_frame, sizeof(unsigned long)) )
+ {
+ PERROR("write: shared_info_frame");
+ goto out;
+ }
+
+ /* Save magic-page locations. */
+ memset(magic_pfns, 0, sizeof(magic_pfns));
+ xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+ &magic_pfns[0]);
+ xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+ &magic_pfns[1]);
+ xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+ &magic_pfns[2]);
+ DPRINTF("kemari_restore: magic_pfns 0: %lld, 1: %lld, 2: %lld\n",
+ magic_pfns[0], magic_pfns[1], magic_pfns[2]);
+ if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+ {
+ PERROR("Error when writing to state file (7)");
+ goto out;
+ }
+
+ xc_get_hvm_param(xc_handle, dom, HVM_PARAM_CALLBACK_IRQ,
+ &callback_irq);
+ DPRINTF("kemari_restore: callback irq %llx", callback_irq);
+ if ( write_exact(io_fd, &callback_irq, sizeof(callback_irq)) )
+ {
+ PERROR("Error when writing to state file (8)");
+ goto out;
+ }
+
+ print_stats(xc_handle, dom, 0, &stats, 0);
+
+ /* Now write out each data page, canonicalising page tables as we go... */
+ {
+ unsigned int prev_pc, sent_this_iter, N, batch, run;
+
+ iter++;
+ sent_this_iter = 0;
+ skip_this_iter = 0;
+ prev_pc = 0;
+ N = 0;
+
+ DPRINTF("Saving memory pages: iter %d 0%%", iter);
+
+ while ( N < p2m_size )
+ {
+ unsigned int this_pc = (N * 100) / p2m_size;
+
+ if ( (this_pc - prev_pc) >= 5 )
+ {
+ DPRINTF("\b\b\b\b%3d%%", this_pc);
+ prev_pc = this_pc;
+ }
+
+ /* load pfn_type[] with the mfn of all the pages we're doing in
+ this batch. */
+ for ( batch = 0;
+ (batch < MAX_BATCH_SIZE) && (N < p2m_size);
+ N++ )
+ {
+ int n = N;
+
+ if ( debug )
+ {
+ DPRINTF("%d pfn= %08lx mfn= %08lx %d",
+ iter, (unsigned long)n,
+ (long unsigned int)0,
+ test_bit(n, to_send));
+ DPRINTF("\n");
+ }
+
+ if ( !( (test_bit(n, to_send)) || (test_bit(n, to_fix))) )
+ continue;
+
+#if 0
+ /* Skip PFNs that aren't really there */
+ if (((n >= 0xa0 && n < 0xc0) /* VGA hole */
+ || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
+ && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */
) {
+ if (n >= shared_info_frame && n <= shared_info_frame + 32)
{
+ /* DPRINTF("shared_info_frame or grant: %d\n", n); */
+ } else {
+ continue;
+ }
+ }
+#endif
+
+ /*
+ ** we get here if:
+ ** 1. page is marked to_send & hasn't already been re-dirtied
+ ** 2. add in pages that still need fixup (net bufs)
+ */
+
+ /* Hypercall interfaces operate in PFNs for HVM guests
+ * and MFNs for PV guests */
+ pfn_type[batch] = n;
+
+ if ( !is_mapped(pfn_type[batch]) )
+ {
+ /*
+ ** not currently in psuedo-physical map -- set bit
+ ** in to_fix since we must send this page in last_iter
+ ** unless its sent sooner anyhow, or it never enters
+ ** pseudo-physical map (e.g. for ballooned down doms)
+ */
+ set_bit(n, to_fix);
+ continue;
+ }
+
+ if ( test_bit(n, to_fix) &&
+ !test_bit(n, to_send) )
+ {
+ needed_to_fix++;
+ DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+ iter, n, pfn_type[batch]);
+ }
+
+ clear_bit(n, to_fix);
+
+ batch++;
+ }
+
+ if ( batch == 0 )
+ goto skip; /* vanishingly unlikely... */
+
+ region_base = xc_map_foreign_batch(
+ xc_handle, dom, PROT_READ, pfn_type, batch);
+ if ( region_base == NULL )
+ {
+ ERROR("map batch failed");
+ goto out;
+ }
+
+ {
+ /* Look for and skip completely empty batches. */
+ for ( j = 0; j < batch; j++ )
+ if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+ XEN_DOMCTL_PFINFO_XTAB )
+ break;
+ if ( j == batch )
+ {
+ munmap(region_base, batch*PAGE_SIZE);
+ continue; /* bail on this batch: no valid pages */
+ }
+ }
+
+ if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
+ {
+ PERROR("Error when writing to state file (2)");
+ goto out;
+ }
+
+ if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
+ {
+ PERROR("Error when writing to state file (3)");
+ goto out;
+ }
+
+ /* entering this loop, pfn_type is now in pfns (Not mfns) */
+ run = 0;
+ for ( j = 0; j < batch; j++ )
+ {
+ unsigned long pfn, pagetype;
+
+ pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+ pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+ if ( pagetype != 0 )
+ {
+ /* If the page is not a normal data page, write out any
+ run of pages we may have previously acumulated */
+ if ( run )
+ {
+ if ( ratewrite(io_fd,
+ (char*)region_base+(PAGE_SIZE*(j-run)),
+ PAGE_SIZE*run) != PAGE_SIZE*run )
+ {
+ ERROR("Error when writing to state file (4a)"
+ " (errno %d)", errno);
+ goto out;
+ }
+ run = 0;
+ }
+ }
+
+ /* skip pages that aren't present */
+ if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+ continue;
+
+ pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+ if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+ (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+ {
+ DPRINTF("canonicalize_pagetable pagetype = %lx pfn =
%lu\n", pagetype, pfn);
+ }
+ else
+ {
+ /* We have a normal page: accumulate it for writing. */
+ run++;
+ }
+ } /* end of the write out for this batch */
+
+ if ( run )
+ {
+ /* write out the last accumulated run of pages */
+ if ( ratewrite(io_fd,
+ (char*)region_base+(PAGE_SIZE*(j-run)),
+ PAGE_SIZE*run) != PAGE_SIZE*run )
+ {
+ ERROR("Error when writing to state file (4c)"
+ " (errno %d)", errno);
+ goto out;
+ }
+ }
+
+ sent_this_iter += batch;
+
+ munmap(region_base, batch*PAGE_SIZE);
+
+ } /* end of this while loop for this iteration */
+
+ skip:
+
+ total_sent += sent_this_iter;
+
+ DPRINTF("\r %d: sent %d, skipped %d, ",
+ iter, sent_this_iter, skip_this_iter );
+
+ {
+ print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+ DPRINTF("Total pages sent= %ld (%.2fx)\n",
+ total_sent, ((float)total_sent)/p2m_size );
+ DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
+ }
+ } /* end of infinite for loop */
+
+ DPRINTF("All memory is saved\n");
+
+ if (send_hvm_params(xc_handle, io_fd, dom) < 0)
+ goto out;
+
+ /* Zero terminate */
+ i = 0;
+ if ( write_exact(io_fd, &i, sizeof(int)) )
+ {
+ PERROR("Error when writing to state file (6')");
+ goto out;
+ }
+
+ if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0)
+ goto out;
+
+ if (!debug)
+ {
+ int rcv_status;
+ if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+ ERROR("Error when reading receiver status");
+ goto out;
+ }
+ DPRINTF("status received: %d\n", rcv_status);
+ }
+
+ if (init_guest_mem(xc_handle, dom) < 0)
+ goto out;
+
+ /* HVM guests are done now */
+ rc = 0;
+
+ out:
+
+ /* Flush last write and discard cache for file. */
+ discard_file_cache(io_fd, 1 /* flush */);
+
+ free(to_send);
+ free(to_fix);
+
+ DPRINTF("Save exit rc=%d\n",rc);
+
+ return !!rc;
+}
+
+
+/*
+ * xc_kemari_update: transmit one incremental update of the guest to io_fd.
+ *
+ * Sequence, as implemented below:
+ *   1. Swap the active/non-active qemu buffer indices and call
+ *      qemu_save_image() with the new active index.
+ *   2. Drain the kemari ring.  Each "index" entry gives a [start, end)
+ *      range of bitmap words; it is followed by one dirty-bitmap entry per
+ *      word.  Every set bit becomes one page in the current batch.
+ *   3. When the batch is nearly full, or the ring has been drained, call
+ *      qemu_end_flip(), merge the dirty bits recorded in the non-active
+ *      qemu bitmap, and write the batch as: batch count, pfn_type[] array,
+ *      then the page contents (gathered with writev).
+ *   4. Send the HVM params, call qemu_end_save(), send the qemu image
+ *      (skipped when XCFLAGS_DEBUG is set), call qemu_image_sent(), write a
+ *      zero terminator, send the HVM context and, unless debugging, read
+ *      back an int status from the receiver.
+ *
+ * Parameters:
+ *   xc_handle, dom   passed through to the helpers that access the guest.
+ *   io_fd            descriptor the update stream is written to / status
+ *                    is read from.
+ *   kemari_ring      pointer to a struct kemari_ring (passed as void *).
+ *   flags            only XCFLAGS_DEBUG is examined here.
+ *   qemu_*           callbacks invoked at the points listed above.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom,
+                     void *kemari_ring, uint32_t flags,
+                     void (*qemu_save_image)(int),
+                     void (*qemu_end_flip)(void),
+                     void (*qemu_end_save)(void),
+                     void (*qemu_image_sent)(void))
+{
+    int rc = 1, k;
+    int debug = (flags & XCFLAGS_DEBUG);
+    uint32_t i, j, index = 0;
+    unsigned int batch = 0;
+    struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+    struct kemari_ent *buf;
+    struct iovec iov[MAX_BATCH_SIZE + 2]; /* 2 for batch and pfn_type */
+    int iovcnt = 2;                       /* slots 0 and 1 are the header */
+
+#define ADD_IOV(base, len) do {                 \
+    iov[iovcnt].iov_base = base;                \
+    iov[iovcnt].iov_len = len;                  \
+    iovcnt++;                                   \
+} while (0)
+
+    /* flip active qemu */
+    qemu_active = qemu_non_active;
+    qemu_non_active = qemu_active ? 0 : 1;
+    qemu_save_image(qemu_active);
+
+    /*
+     * main iteration starts from here
+     */
+    while (ring->cons < ring->prod) {
+
+        kemari_ring_read(ring, &buf);
+
+        /* start/end are latched here because buf is re-pointed below. */
+        for (i = buf->u.index.start, j = buf->u.index.end; i < j; i++) {
+
+            unsigned long word;
+            unsigned int bit;
+
+            index = i * BITS_PER_LONG;
+
+            kemari_ring_read(ring, &buf);
+
+            /*
+             * Walk the word bit by bit on a local copy.  The previous
+             * ffs()-based walk took an int argument, so it could not see
+             * the upper half of a long-wide word, and it shifted by the
+             * full width when only the top bit was set.  The ring entry is
+             * left at zero, which is the state the old in-place shifting
+             * walk left behind.
+             */
+            word = buf->u.dirty_bitmap;
+            buf->u.dirty_bitmap = 0;
+
+            for (bit = 0; word != 0 && bit < BITS_PER_LONG;
+                 bit++, word >>= 1) {
+                int n;
+
+                if (!(word & 1UL))
+                    continue;
+
+                n = index + bit;
+#if 0
+                if (((n >= 0xa0 && n < 0xc0) /* VGA hole */
+                     || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
+                         && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ ) {
+                    if (n >= shared_info_frame &&
+                        n <= shared_info_frame + 32) {
+                        ;
+                    } else {
+                        continue;
+                    }
+                }
+#endif
+                ADD_IOV(search_guest_mem(xc_handle, dom, n), PAGE_SIZE);
+                pfn_type[batch] = n;
+                batch++;
+            }
+
+            /*
+             * Keep accumulating while a further full word is guaranteed to
+             * fit in the batch and the ring still has entries.
+             */
+            if ((batch + BITS_PER_LONG - 1 < MAX_BATCH_SIZE) &&
+                !(ring->cons == ring->prod))
+                continue;
+
+            /* Pull in the dirty bits from qemu-dm too */
+            qemu_end_flip();
+            /*
+             * NOTE(review): BITMAP_SIZE is used as a byte count elsewhere
+             * in this file, so this bound may cover fewer words than
+             * intended -- confirm against the allocation of qemu_bitmaps.
+             */
+            for ( k = 0; k < BITMAP_SIZE / BITS_PER_LONG; k++) {
+                /* Full-width copy: the word indexes BITS_PER_LONG pages. */
+                unsigned long bmp = qemu_bitmaps[qemu_non_active][k];
+
+                if (bmp != 0) {
+                    index = k * BITS_PER_LONG;
+
+                    /*
+                     * The bit position is tracked per word.  Previously a
+                     * shadowing 'offset' was reset on every iteration, so
+                     * every bit after the first mapped to the wrong page.
+                     */
+                    for (bit = 0; bmp != 0 && bit < BITS_PER_LONG;
+                         bit++, bmp >>= 1) {
+                        int n;
+
+                        if (!(bmp & 1UL))
+                            continue;
+
+                        /* Check before writing: iov[] and pfn_type[] are
+                         * sized for MAX_BATCH_SIZE pages. */
+                        if (batch >= MAX_BATCH_SIZE) {
+                            ERROR("Sorry, reached MAX_BATCH_SIZE. "
+                                  "We will fix this lator.");
+                            goto out;
+                        }
+
+                        n = index + bit;
+
+                        ADD_IOV(search_guest_mem(xc_handle, dom, n),
+                                PAGE_SIZE);
+                        pfn_type[batch] = n;
+                        batch++;
+                    }
+                    qemu_bitmaps[qemu_non_active][k] = 0;
+                }
+                if (batch >= MAX_BATCH_SIZE) {
+                    ERROR("Sorry, reached MAX_BATCH_SIZE. "
+                          "We will fix this lator.");
+                    goto out;
+                }
+            }
+
+            PPRINTF("batch %d\n", batch);
+
+            /* send pages */
+            iov[0].iov_base = &batch;
+            iov[0].iov_len = sizeof(batch);
+
+            iov[1].iov_base = pfn_type;
+            iov[1].iov_len = sizeof(pfn_type[0]) * batch;
+
+            /* writev accepts at most IOV_MAX vectors per call; send the
+             * array in chunks and never issue an empty call. */
+            for (k = 0; k < iovcnt; k += IOV_MAX) {
+                int count = (iovcnt - k < IOV_MAX) ? (iovcnt - k) : IOV_MAX;
+                if (writev_exact(io_fd, &iov[k], count)) {
+                    ERROR("Error when writing pages state file (2--4)"
+                          " (errno %d)", errno);
+                    goto out;
+                }
+            }
+
+            /* Start the next batch from an empty vector; slots 0 and 1
+             * stay reserved for the header. */
+            batch = 0;
+            iovcnt = 2;
+        }
+    }
+
+    if (send_hvm_params(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_end_save();
+    if (!debug && send_qemu_image(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_image_sent();
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) )
+    {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+
+    if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0)
+        goto out;
+
+    if (!debug)
+    {
+        /* Wait for the receiver's status word; its value is not examined. */
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+    }
+
+    rc = 0;
+out:
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|