# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Date 1169478932 0
# Node ID 207523704fb15ae92b1852bb7e1f0e739ed01fb3
# Parent baa9b76ea3e1de27dbe46ba9b3fb117e09637518
Implement clean return from save/restore failure (so that the original
domain can continue execution).
Signed-off-by: Andrei Petrov <andrei.petrov@xxxxxxxxxxxxx>
---
tools/libxc/xc_resume.c | 156 +++++++++++++++++++++++++++++---
tools/libxc/xg_save_restore.h | 9 -
tools/python/xen/lowlevel/xc/xc.c | 4
tools/python/xen/xend/XendCheckpoint.py | 24 ++++
tools/python/xen/xend/XendDomain.py | 1
tools/python/xen/xend/XendDomainInfo.py | 29 +++++
6 files changed, 200 insertions(+), 23 deletions(-)
diff -r baa9b76ea3e1 -r 207523704fb1 tools/libxc/xc_resume.c
--- a/tools/libxc/xc_resume.c Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/libxc/xc_resume.c Mon Jan 22 15:15:32 2007 +0000
@@ -1,5 +1,6 @@
#include "xc_private.h"
-
+#include "xg_private.h"
+#include "xg_save_restore.h"
#if defined(__i386__) || defined(__x86_64__)
static int modify_returncode(int xc_handle, uint32_t domid)
@@ -22,19 +23,7 @@ static int modify_returncode(int xc_hand
}
#endif
-
-/*
- * Resume execution of a domain after suspend shutdown.
- * This can happen in one of two ways:
- * 1. Resume with special return code.
- * 2. Reset guest environment so it believes it is resumed in a new
- * domain context.
- * (2) should be used only for guests which cannot handle the special
- * new return code. (1) is always safe (but slower).
- *
- * XXX Only (2) is implemented below. We need to use (1) by default!
- */
-int xc_domain_resume(int xc_handle, uint32_t domid)
+static int xc_domain_resume_cooperative(int xc_handle, uint32_t domid)
{
DECLARE_DOMCTL;
int rc;
@@ -50,3 +39,142 @@ int xc_domain_resume(int xc_handle, uint
domctl.domain = domid;
return do_domctl(xc_handle, &domctl);
}
+
+static int xc_domain_resume_any(int xc_handle, uint32_t domid)
+{
+ DECLARE_DOMCTL;
+ int i, rc = -1;
+
+ /*
+ * (x86 only) Rewrite store_mfn and console_mfn back to MFN (from PFN).
+ */
+#if defined(__i386__) || defined(__x86_64__)
+ xc_dominfo_t info;
+ unsigned long mfn, max_pfn = 0;
+ vcpu_guest_context_t ctxt;
+ start_info_t *start_info;
+ shared_info_t *shinfo = NULL;
+ xen_pfn_t *p2m_frame_list_list = NULL;
+ xen_pfn_t *p2m_frame_list = NULL;
+ xen_pfn_t *p2m = NULL;
+
+ if ( xc_domain_getinfo(xc_handle, domid, 1, &info) != 1 )
+ {
+ PERROR("Could not get domain info");
+ goto out;
+ }
+
+ /* Map the shared info frame */
+ shinfo = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
+ PROT_READ, info.shared_info_frame);
+ if ( shinfo == NULL )
+ {
+ ERROR("Couldn't map shared info");
+ goto out;
+ }
+
+ max_pfn = shinfo->arch.max_pfn;
+
+ p2m_frame_list_list =
+ xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, PROT_READ,
+ shinfo->arch.pfn_to_mfn_frame_list_list);
+ if ( p2m_frame_list_list == NULL )
+ {
+ ERROR("Couldn't map p2m_frame_list_list");
+ goto out;
+ }
+
+ p2m_frame_list = xc_map_foreign_batch(xc_handle, domid, PROT_READ,
+ p2m_frame_list_list,
+ P2M_FLL_ENTRIES);
+ if ( p2m_frame_list == NULL )
+ {
+ ERROR("Couldn't map p2m_frame_list");
+ goto out;
+ }
+
+ /* Map all the frames of the pfn->mfn table. For migrate to succeed,
+ the guest must not change which frames are used for this purpose.
+ (it's not clear why it would want to change them, and we'll be OK
+ from a safety POV anyhow.) */
+ p2m = xc_map_foreign_batch(xc_handle, domid, PROT_READ,
+ p2m_frame_list,
+ P2M_FL_ENTRIES);
+ if ( p2m == NULL )
+ {
+ ERROR("Couldn't map p2m table");
+ goto out;
+ }
+
+ if ( lock_pages(&ctxt, sizeof(ctxt)) )
+ {
+ ERROR("Unable to lock ctxt");
+ goto out;
+ }
+
+ if ( xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt) )
+ {
+ ERROR("Could not get vcpu context");
+ goto out;
+ }
+
+ mfn = ctxt.user_regs.edx;
+
+ start_info = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
+ PROT_READ | PROT_WRITE, mfn);
+ if ( start_info == NULL )
+ {
+ ERROR("Couldn't map start_info");
+ goto out;
+ }
+
+ start_info->store_mfn = p2m[start_info->store_mfn];
+ start_info->console.domU.mfn = p2m[start_info->console.domU.mfn];
+
+ munmap(start_info, PAGE_SIZE);
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
+ /* Reset all secondary CPU states. */
+ for ( i = 1; i <= info.max_vcpu_id; i++ )
+ xc_vcpu_setcontext(xc_handle, domid, i, NULL);
+
+ /* Ready to resume domain execution now. */
+ domctl.cmd = XEN_DOMCTL_resumedomain;
+ domctl.domain = domid;
+ rc = do_domctl(xc_handle, &domctl);
+
+#if defined(__i386__) || defined(__x86_64__)
+ out:
+ unlock_pages((void *)&ctxt, sizeof ctxt);
+ if (p2m)
+ munmap(p2m, P2M_FL_ENTRIES*PAGE_SIZE);
+ if (p2m_frame_list)
+ munmap(p2m_frame_list, P2M_FLL_ENTRIES*PAGE_SIZE);
+ if (p2m_frame_list_list)
+ munmap(p2m_frame_list_list, PAGE_SIZE);
+ if (shinfo)
+ munmap(shinfo, PAGE_SIZE);
+#endif
+
+ return rc;
+}
+
+/*
+ * Resume execution of a domain after suspend shutdown.
+ * This can happen in one of two ways:
+ * 1. Resume with special return code.
+ * 2. Reset guest environment so it believes it is resumed in a new
+ * domain context.
+ * (2) should be used only for guests which cannot handle the special
+ * new return code. (1) is always safe (but slower).
+ */
+int xc_domain_resume(int xc_handle, uint32_t domid)
+{
+ /*
+ * XXX: Implement a way to select between options (1) and (2).
+ * Or expose the options as two different methods to Python.
+ */
+ return (0
+ ? xc_domain_resume_cooperative(xc_handle, domid)
+ : xc_domain_resume_any(xc_handle, domid));
+}
diff -r baa9b76ea3e1 -r 207523704fb1 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/libxc/xg_save_restore.h Mon Jan 22 15:15:32 2007 +0000
@@ -34,11 +34,10 @@
**
** Returns 1 on success, 0 on failure.
*/
-static int get_platform_info(int xc_handle, uint32_t dom,
- /* OUT */ unsigned long *max_mfn,
- /* OUT */ unsigned long *hvirt_start,
- /* OUT */ unsigned int *pt_levels)
-
+static inline int get_platform_info(int xc_handle, uint32_t dom,
+ /* OUT */ unsigned long *max_mfn,
+ /* OUT */ unsigned long *hvirt_start,
+ /* OUT */ unsigned int *pt_levels)
{
xen_capabilities_info_t xen_caps = "";
xen_platform_parameters_t xen_params;
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/lowlevel/xc/xc.c Mon Jan 22 15:15:32 2007 +0000
@@ -1064,9 +1064,9 @@ static PyMethodDef pyxc_methods[] = {
"Destroy a domain.\n"
" dom [int]: Identifier of domain to be destroyed.\n\n"
"Returns: [int] 0 on success; -1 on error.\n" },
-
+
{ "domain_resume",
- (PyCFunction)pyxc_domain_resume,
+ (PyCFunction)pyxc_domain_resume,
METH_VARARGS, "\n"
"Resume execution of a suspended domain.\n"
" dom [int]: Identifier of domain to be resumed.\n\n"
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/xend/XendCheckpoint.py Mon Jan 22 15:15:32 2007 +0000
@@ -122,6 +122,8 @@ def save(fd, dominfo, network, live, dst
os.remove("/tmp/xen.qemu-dm.%d" % dominfo.getDomid())
dominfo.destroyDomain()
+ dominfo.testDeviceComplete()
+
try:
dominfo.setName(domain_name)
except VmError:
@@ -134,11 +136,31 @@ def save(fd, dominfo, network, live, dst
except Exception, exn:
log.exception("Save failed on domain %s (%s).", domain_name,
dominfo.getDomid())
+
+ dominfo._releaseDevices()
+ dominfo.testDeviceComplete()
+ dominfo.testvifsComplete()
+ log.debug("XendCheckpoint.save: devices released")
+
+ dominfo._resetChannels()
+
+ dominfo._removeDom('control/shutdown')
+ dominfo._removeDom('device-misc/vif/nextDeviceID')
+
+ dominfo._createChannels()
+ dominfo._introduceDomain()
+ dominfo._storeDomDetails()
+
+ dominfo._createDevices()
+ log.debug("XendCheckpoint.save: devices created")
+
+ dominfo.resumeDomain()
+ log.debug("XendCheckpoint.save: resumeDomain")
+
try:
dominfo.setName(domain_name)
except:
log.exception("Failed to reset the migrating domain's name")
- raise Exception, exn
def restore(xd, fd, dominfo = None, paused = False):
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/xend/XendDomain.py Mon Jan 22 15:15:32 2007 +0000
@@ -1166,7 +1166,6 @@ class XendDomain:
sock.send("receive\n")
sock.recv(80)
XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst)
- dominfo.testDeviceComplete()
sock.close()
def domain_save(self, domid, dst):
diff -r baa9b76ea3e1 -r 207523704fb1 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py Mon Jan 22 14:13:26 2007 +0000
+++ b/tools/python/xen/xend/XendDomainInfo.py Mon Jan 22 15:15:32 2007 +0000
@@ -1580,6 +1580,16 @@ class XendDomainInfo:
log.exception("Exception in alloc_unbound(%d)", self.domid)
raise
+ def _resetChannels(self):
+ """Reset all event channels in the domain.
+ """
+ try:
+ return xc.evtchn_reset(dom=self.domid)
+ except:
+ log.exception("Exception in evtcnh_reset(%d)", self.domid)
+ raise
+
+
#
# Bootloader configuration
#
@@ -1727,6 +1737,25 @@ class XendDomainInfo:
test = 0
diff = time.time() - start
for i in self.getDeviceController('vbd').deviceIDs():
+ test = 1
+ log.info("Dev %s still active, looping...", i)
+ time.sleep(0.1)
+
+ if test == 0:
+ break
+ if diff >= MIGRATE_TIMEOUT:
+ log.info("Dev still active but hit max loop timeout")
+ break
+
+ def testvifsComplete(self):
+ """ In case vifs are released and then created for the same
+ domain, we need to wait for the devices to shut down.
+ """
+ start = time.time()
+ while True:
+ test = 0
+ diff = time.time() - start
+ for i in self.getDeviceController('vif').deviceIDs():
test = 1
log.info("Dev %s still active, looping...", i)
time.sleep(0.1)
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|