Hello,
I've been working on VM snapshots/CoW for Xen (as seen at Xen Summit
this year). I'm happy to release my first version.
There are some known issues with capturing page dirties for HVM guests.
I think it's related to the QEMU code (I believe I'm not catching the
pages that QEMU dirties). However, it works with both 32-bit and 64-bit
PV guests.
This release includes modifications to Linux, Xen, and some tools (a
library, FUSE fs, and testing tool).
The FUSE file system will take a snapshot if you try to create a file in
the directory where the xencowfs file system is mounted.
The testing tool pauses a domain, enables CoW, takes a dump of the CoW
image, takes two live memory dumps, unpauses the domain for a bit, then
takes another CoW image. It compares all the images and reports on which
pages are different.
These patches are against xen-unstable revision 19425; however, I had no
trouble applying them to the current revision of xen-unstable (19553).
Please remember this is an alpha release, so there are likely to be some
problems. Please let me know if you find any!
Patrick
diff -r 832aac894efd drivers/xen/Kconfig
--- a/drivers/xen/Kconfig Wed Nov 19 13:15:46 2008 +0000
+++ b/drivers/xen/Kconfig Mon Mar 16 00:01:12 2009 -0700
@@ -312,4 +312,7 @@
config XEN_DEVMEM
def_bool y
+config XEN_XENCOW
+ def_bool y
+
endif
diff -r 832aac894efd drivers/xen/Makefile
--- a/drivers/xen/Makefile Wed Nov 19 13:15:46 2008 +0000
+++ b/drivers/xen/Makefile Mon Mar 16 00:01:12 2009 -0700
@@ -23,3 +23,4 @@
obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL) += sfc_netutil/
obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND) += sfc_netfront/
obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND) += sfc_netback/
+obj-$(CONFIG_XEN_XENCOW) += xencow/
diff -r 832aac894efd drivers/xen/xencow/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/xen/xencow/Makefile Mon Mar 16 00:01:12 2009 -0700
@@ -0,0 +1,2 @@
+
+obj-m := xencow.o
diff -r 832aac894efd drivers/xen/xencow/common.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/xen/xencow/common.h Mon Mar 16 00:01:12 2009 -0700
@@ -0,0 +1,74 @@
+/******************************************************************************
+ * common.h
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_XENCOW_COMMON_H__
+#define __XEN_XENCOW_COMMON_H__
+
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <xen/interface/platform.h>
+#include <xen/driver_util.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/cdev.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/xencow.h>
+
+
+/* Debug trace through pr_debug(), prefixed with the source file and line. */
+#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+
+/* Warning-level printk() whose message is prefixed with "xen_cow: ". */
+#define WPRINTK(fmt, args...) printk(KERN_WARNING "xen_cow: " fmt, ##args)
+
+
+#endif /* __XEN_XENCOW_COMMON_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 832aac894efd drivers/xen/xencow/xencow.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/xen/xencow/xencow.c Mon Mar 16 00:01:12 2009 -0700
@@ -0,0 +1,246 @@
+/******************************************************************************
+ * xencow.c
+ *
+ * Xen Copy-on-Write Kernel Driver - Initialises CoW buffer for userspace
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+#include "common.h"
+
+
+/*
+ * Character-device major number returned by register_chrdev() in
+ * xencow_init(); xencow_exit() needs it again to unregister the device.
+ */
+static int xencow_major;
+
+
+/*
+ * Drop the references held on the first @num entries of @pages.
+ *
+ * Every page is flagged up to date before its reference is released.
+ */
+static void xencow_release_user_pages(struct page *pages[], int num)
+{
+    int idx = 0;
+
+    while ( idx < num )
+    {
+        SetPageUptodate(pages[idx]);
+        put_page(pages[idx]);
+        idx++;
+    }
+}
+
+/*
+ * Pin @num pages of the calling process' address space, starting at @addr.
+ *
+ * @addr:  user virtual address of the first page
+ * @num:   number of pages to pin
+ * @pages: output array receiving one struct page pointer per pinned page
+ *
+ * Returns 0 when all @num pages were pinned.  If get_user_pages() pins fewer
+ * pages than requested, the ones it did pin are released again and -E2BIG is
+ * returned; a negative get_user_pages() result is passed through unchanged.
+ *
+ * NOTE(review): the pages are requested with write=0 and force=0 -- confirm
+ * that read-only pinning is sufficient for how the buffer is used later.
+ */
+static int xencow_get_user_pages(unsigned long addr,
+                                 int num,
+                                 struct page *pages[])
+{
+    int ret;
+
+    /* get_user_pages() expects mmap_sem to be held for reading. */
+    down_read(&current->mm->mmap_sem);
+    ret = get_user_pages(current, current->mm, addr, num, 0, 0, pages, NULL);
+    up_read(&current->mm->mmap_sem);
+
+    if ( ret == num )
+        return 0;
+
+    if ( ret >= 0 )
+    {
+        /* Partial success: undo it so the caller never owns a subset. */
+        xencow_release_user_pages(pages, ret);
+        ret = -E2BIG;
+    }
+
+    return ret;
+}
+
+/*
+ * Translate a struct page into a machine frame number by converting it to
+ * its PFN and passing that through pfn_to_mfn().
+ */
+static inline unsigned long xencow_page_to_mfn(struct page *page)
+{
+    return pfn_to_mfn(page_to_pfn(page));
+}
+
+/*
+ * Look up the machine frame numbers backing a user buffer.
+ *
+ * @addr: user virtual address of the first page of the buffer
+ * @num:  number of pages in the buffer
+ * @mfns: output array with room for @num entries
+ *
+ * Returns 0 on success, -EINVAL if @num is not positive, -ENOMEM if the
+ * temporary page array cannot be allocated, -EFAULT if a page translates
+ * to MFN 0, or the error reported by xencow_get_user_pages().
+ *
+ * The page references are held only for the duration of the lookup and are
+ * dropped again on every exit path.
+ */
+static int xencow_get_page_mfns(unsigned long addr,
+                                int num,
+                                unsigned long mfns[])
+{
+    struct page **pages;
+    int ret;
+    int i;
+
+    if ( num <= 0 )
+        return -EINVAL;
+
+    /*
+     * @num originates from the ioctl caller, so the page array must not be
+     * a variable-length array on the kernel stack; kcalloc() also guards
+     * the size multiplication against overflow.
+     */
+    pages = kcalloc(num, sizeof(*pages), GFP_KERNEL);
+    if ( pages == NULL )
+        return -ENOMEM;
+
+    /* Get user pages */
+    ret = xencow_get_user_pages(addr, num, pages);
+    if ( ret != 0 )
+        goto out_free;
+
+    /* Get MFNs for the pages */
+    for ( i = 0; i < num; i++ )
+    {
+        unsigned long mfn = xencow_page_to_mfn(pages[i]);
+
+        if ( mfn == 0 )
+        {
+            ret = -EFAULT;
+            break;
+        }
+
+        mfns[i] = mfn;
+    }
+
+    /* Return user pages, also when a translation failed above */
+    xencow_release_user_pages(pages, num);
+
+ out_free:
+    kfree(pages);
+    return ret;
+}
+
+/*
+ * Handle XEN_COW_IOCTL_INIT: translate the caller's page buffer into MFNs.
+ *
+ * Reads addr and num_mfns from the user's xencow_init_t, looks up the MFN of
+ * every page of the buffer and writes the results to the user's mfns[] array.
+ *
+ * Returns 0 on success, -EFAULT if the user structure cannot be read or
+ * written, -EINVAL if num_mfns is not positive or too large to size an
+ * allocation, -ENOMEM on allocation failure, or the error reported by
+ * xencow_get_page_mfns().
+ */
+static int xencow_ioctl_init(xencow_init_t __user *cow_init_u)
+{
+    xencow_init_t *cow_init;
+    size_t mfns_size;
+    int num_mfns;
+    int ret;
+
+    /* Check access on user init struct */
+    if ( !access_ok(VERIFY_READ, cow_init_u, sizeof(xencow_init_t)) )
+        return -EFAULT;
+
+    /* Get the number of frames in the buffer */
+    ret = __get_user(num_mfns, &cow_init_u->num_mfns);
+    if ( ret != 0 )
+        return ret;
+
+    /*
+     * num_mfns is untrusted: a negative or huge value would wrap the
+     * allocation size computed below and let later stores run past the
+     * end of the allocation.
+     */
+    if ( (num_mfns <= 0) ||
+         ((size_t)num_mfns >
+          (~(size_t)0 - sizeof(xencow_init_t)) / sizeof(unsigned long)) )
+        return -EINVAL;
+
+    mfns_size = (size_t)num_mfns * sizeof(unsigned long);
+
+    /* Allocate space for the header plus one MFN slot per frame */
+    cow_init = kmalloc(sizeof(xencow_init_t) + mfns_size, GFP_KERNEL);
+    if ( cow_init == NULL )
+        return -ENOMEM;
+
+    cow_init->num_mfns = num_mfns;
+
+    /* Get start address of buffer */
+    ret = __get_user(cow_init->addr, &cow_init_u->addr);
+    if ( ret != 0 )
+        goto out;
+
+    /* Get page buffer MFNs */
+    ret = xencow_get_page_mfns(cow_init->addr,
+                               cow_init->num_mfns,
+                               cow_init->mfns);
+    if ( ret != 0 )
+        goto out;
+
+    /* Send page buffer MFNs to user; copy_to_user() checks the range */
+    if ( copy_to_user(cow_init_u->mfns, cow_init->mfns, mfns_size) )
+        ret = -EFAULT;
+
+ out:
+    kfree(cow_init);
+    return ret;
+}
+
+/*
+ * ioctl entry point of the xencow character device.
+ *
+ * @cmd: XEN_COW_IOCTL_INIT is the only supported command
+ * @arg: for XEN_COW_IOCTL_INIT, a user pointer to a xencow_init_t
+ *
+ * Returns the result of the command handler, or -ENOTTY for any other
+ * command.  @inode and @filp are not used.
+ */
+static int xencow_ioctl(struct inode *inode, struct file *filp,
+                        unsigned int cmd, unsigned long arg)
+{
+    switch ( cmd )
+    {
+    case XEN_COW_IOCTL_INIT:
+        return xencow_ioctl_init((xencow_init_t __user *)arg);
+
+    default:
+        return -ENOTTY;
+    }
+}
+
+/* File operations of the xencow character device; only ioctl is provided. */
+static const struct file_operations xencow_fops = {
+ .owner = THIS_MODULE,
+ .ioctl = xencow_ioctl,
+};
+
+/*
+ * Module initialisation: register the "xencow" character device (passing 0
+ * as the major and storing the value register_chrdev() returns) and, if the
+ * xen class is available, create the "xencow0" class device for it.
+ *
+ * Returns -ENODEV when not running on Xen, the register_chrdev() error if
+ * registration fails, and 0 otherwise.
+ */
+static int __init xencow_init(void)
+{
+    struct class *xen_class;
+    int major;
+
+    if ( !is_running_on_xen() )
+        return -ENODEV;
+
+    major = register_chrdev(0, "xencow", &xencow_fops);
+    if ( major < 0 )
+    {
+        WPRINTK("Couldn't register /dev/xen/xencow\n");
+        return major;
+    }
+
+    xencow_major = major;
+
+    DPRINTK("Created misc_dev [/dev/xen/xencow%d]\n", xencow_major);
+
+    /* A missing xen class is bad, but not fatal */
+    xen_class = get_xen_class();
+    if ( xen_class == NULL )
+        WPRINTK("sysfs xen_class not created\n");
+    else
+        class_device_create(xen_class, NULL, MKDEV(xencow_major, 0),
+                            NULL, "xencow0");
+
+    DPRINTK("XenCoW device successfully created\n");
+
+    return 0;
+}
+
+/*
+ * Module teardown: remove the class device created by xencow_init() and
+ * unregister the character device.
+ */
+static void __exit xencow_exit(void)
+{
+    struct class *class;
+    int ret;
+
+    /* Undo class_device_create() so no stale sysfs entry outlives us */
+    class = get_xen_class();
+    if ( class != NULL )
+        class_device_destroy(class, MKDEV(xencow_major, 0));
+
+    ret = unregister_chrdev(xencow_major, "xencow");
+    if ( ret < 0 )
+    {
+        WPRINTK("Error: Couldn't unregister /dev/xen/xencow: %d\n", ret);
+        return;
+    }
+
+    DPRINTK("XenCoW device successfully removed\n");
+}
+
+/* Register the module's load and unload handlers and declare its licence. */
+module_init(xencow_init);
+module_exit(xencow_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff -r 832aac894efd include/xen/interface/io/xencow.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/include/xen/interface/io/xencow.h Mon Mar 16 00:01:12 2009 -0700
@@ -0,0 +1,70 @@
+/*****************************************************************************
+ * xencow.h
+ *
+ * XenCoW Common Structures
+ *
+ * Copyright (C) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#ifndef _XEN_PUBLIC_IO_XENCOW_H
+#define _XEN_PUBLIC_IO_XENCOW_H
+
+
+#include "ring.h"
+
+
+/*
+ * ioctl magic and commands of the xencow character device.
+ * NOTE(review): XEN_COW_IOCTL_INIT is declared with _IO(), which encodes no
+ * argument size or direction, yet the driver both reads and writes a
+ * xencow_init_t through the argument -- confirm whether _IOWR() was intended.
+ */
+#define XEN_COW_IOC_MAGIC 'w'
+#define XEN_COW_IOCTL_INIT _IO(XEN_COW_IOC_MAGIC, 1)
+
+
+/*
+ * Request and response entries of the XenCoW ring buffer.  A request carries
+ * an MFN and a response carries a PFN; DEFINE_RING_TYPES() is instantiated
+ * with the "xencow" name and these two entry types.
+ */
+typedef struct xencow_request_st {
+ ulong mfn;
+} xencow_request_t;
+
+typedef struct xencow_response_st {
+ ulong pfn;
+} xencow_response_t;
+
+DEFINE_RING_TYPES(xencow, xencow_request_t, xencow_response_t);
+
+
+/*
+ * The structure used to initialise a XenCoW snapshot; it is the argument of
+ * XEN_COW_IOCTL_INIT.  The caller fills in addr and num_mfns, and the driver
+ * writes one MFN per buffer page into mfns[], which therefore needs room for
+ * num_mfns entries.
+ */
+typedef struct xencow_init_st {
+ /* Start address of buffer (a virtual address in the calling process) */
+ unsigned long addr;
+ /* Number of frames in buffer */
+ int num_mfns;
+ /* MFNs of buffer frames, filled in by the driver */
+ unsigned long mfns[];
+} xencow_init_t;
+
+
+
+#endif /* _XEN_PUBLIC_IO_XENCOW_H */
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/Makefile
--- a/tools/Makefile Fri Mar 20 17:42:46 2009 +0000
+++ b/tools/Makefile Mon Apr 20 10:21:49 2009 -0700
@@ -26,6 +26,7 @@
SUBDIRS-$(CONFIG_Linux) += fs-back
SUBDIRS-$(CONFIG_IOEMU) += ioemu-dir
SUBDIRS-y += xenpmd
+SUBDIRS-y += xencow
# These don't cross-compile
ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH))
diff -r 0477f9061c8a tools/xencow/COPYING
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/COPYING Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,340 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff -r 0477f9061c8a tools/xencow/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/Makefile Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,10 @@
+XEN_ROOT=../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+SUBDIRS-y :=
+SUBDIRS-y += lib
+SUBDIRS-y += xencowfs
+SUBDIRS-y += test
+
+.PHONY: all clean install
+all install clean: %: subdirs-%
diff -r 0477f9061c8a tools/xencow/README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/README Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,19 @@
+Xen Copy on Write
+-----------------------
+Provides copy-on-write functionality for the memory of Xen domains.
+
+
+
+
+Usage Notes and issues
+----------------------
+
+
+Future Work
+-----------
+
+Authors
+-------
+Chris Matthews <cmatthew@xxxxxxxxxx>
+Geoffrey Lefebvre <geoffrey@xxxxxxxxx>
+Brendan Cully <brendan@xxxxxxxxx>
diff -r 0477f9061c8a tools/xencow/lib/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/Makefile Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,66 @@
+XEN_ROOT=../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+MAKE_LINK=ln -sf
+
+# Shared-object version; SONAME is what gets recorded in the library.
+MAJOR = 0
+MINOR = 0
+SONAME = libxencow.so.$(MAJOR)
+
+CFLAGS += -I $(XEN_XC)
+CFLAGS += -I ./
+CFLAGS += $(CFLAGS_libxenctrl)
+LDFLAGS += $(LDFLAGS_libxenctrl)
+
+SRCS :=
+SRCS += xc.c xencow.c
+
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -fPIC
+CFLAGS += -g
+
+# Get gcc to generate the dependencies for us.
+CFLAGS += -Wp,-MD,.$(@F).d
+DEPS = .*.d
+
+OBJS = $(SRCS:.c=.o)
+OBJS_PIC = $(SRCS:.c=.opic)
+IBINS :=
+
+LIB = libxencow.a libxencow.so.$(MAJOR).$(MINOR)
+
+.PHONY: all
+all: $(LIB)
+
+# Install both libraries, the usual .so symlink chain and the public headers.
+.PHONY: install
+install: all
+	$(INSTALL_DIR) $(DESTDIR)$(LIBDIR)
+	$(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)
+	$(INSTALL_DATA) $(LIB) $(DESTDIR)$(LIBDIR)
+	$(MAKE_LINK) libxencow.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/$(SONAME)
+	$(MAKE_LINK) $(SONAME) $(DESTDIR)$(LIBDIR)/libxencow.so
+	$(INSTALL_DATA) xencow.h $(DESTDIR)$(INCLUDEDIR)
+	$(INSTALL_DATA) xencow_list.h $(DESTDIR)$(INCLUDEDIR)
+
+.PHONY: clean
+clean:
+	rm -rf *.a *.so* *.o *.opic $(LIB) *~ $(DEPS) xen TAGS
+
+# Link the shared library and create the local symlink chain next to it.
+libxencow.so.$(MAJOR).$(MINOR): $(OBJS_PIC)
+	$(CC) $(CFLAGS) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,$(SONAME) $(SHLIB_CFLAGS) -o $@ $^
+	$(MAKE_LINK) libxencow.so.$(MAJOR).$(MINOR) $(SONAME)
+	$(MAKE_LINK) $(SONAME) libxencow.so
+
+libxencow.a: $(OBJS)
+	$(AR) rcs $@ $^
+
+.PHONY: TAGS
+TAGS:
+	etags -t $(SRCS) *.h
+
+-include $(DEPS)
+
diff -r 0477f9061c8a tools/xencow/lib/xc.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/xc.c Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,357 @@
+/******************************************************************************
+ * tools/xencow/lib/xc.c
+ *
+ * libxc refactorisation. This should be put in libxc ultimately.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#include <xg_private.h>
+#include <xg_save_restore.h>
+#include "xc.h"
+
+
+/*
+ * True when _mfn is below max_mfn and round-trips through both tables:
+ * mfn_to_pfn(_mfn) is a PFN below p2m_size, and pfn_to_mfn() of that PFN
+ * gives back _mfn.
+ *
+ * The expansion refers to max_mfn and p2m_size by name and uses the
+ * mfn_to_pfn()/pfn_to_mfn() macros, so all of those (and whatever the two
+ * macros reference) must be visible wherever this is used.
+ */
+#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                  \
+    (((_mfn) < (max_mfn)) &&                            \
+     ((mfn_to_pfn(_mfn) < (p2m_size)) &&                \
+      (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))
+
+
+/*
+ * Map the machine-to-physical table into this process.
+ *
+ * The MFNs backing the table are fetched with XENMEM_machphys_mfn_list and
+ * then mapped from DOMID_XEN with protection 'prot', one M2P_CHUNK_SIZE
+ * chunk per MFN.
+ *
+ * On success the mapping is returned and *m2p_mfn0 receives the MFN of the
+ * first chunk.  On any failure NULL is returned and *m2p_mfn0 is left alone.
+ * The temporary arrays are released on every path.
+ */
+xen_pfn_t *xc_map_m2p(int xc_handle, unsigned long max_mfn, int prot,
+                      unsigned long *m2p_mfn0)
+{
+    struct xen_machphys_mfn_list xmml;
+    privcmd_mmap_entry_t *entries = NULL;
+    xen_pfn_t *extent_start;
+    xen_pfn_t *m2p = NULL;
+    unsigned long m2p_size = M2P_SIZE(max_mfn);
+    unsigned long m2p_chunks = M2P_CHUNKS(max_mfn);
+    int i;
+
+    extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t));
+    if ( extent_start == NULL )
+    {
+        ERROR("failed to allocate space for m2p mfns");
+        return NULL;
+    }
+
+    xmml.max_extents = m2p_chunks;
+    set_xen_guest_handle(xmml.extent_start, extent_start);
+
+    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
+         (xmml.nr_extents != m2p_chunks) )
+    {
+        ERROR("xc_get_m2p_mfns");
+        goto done;
+    }
+
+    entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t));
+    if ( entries == NULL )
+    {
+        ERROR("failed to allocate space for mmap entries");
+        goto done;
+    }
+
+    for ( i = 0; i < m2p_chunks; i++ )
+        entries[i].mfn = extent_start[i];
+
+    m2p = xc_map_foreign_ranges(xc_handle, DOMID_XEN, m2p_size, prot,
+                                M2P_CHUNK_SIZE, entries, m2p_chunks);
+    if ( m2p != NULL )
+        *m2p_mfn0 = entries[0].mfn;
+    else
+        ERROR("xc_mmap_foreign_ranges failed");
+
+ done:
+    /* free(NULL) is a no-op, so one exit path covers every case. */
+    free(entries);
+    free(extent_start);
+    return m2p;
+}
+
+/*
+ * During transfer (or in the state file), all page-table pages must be
+ * converted into a 'canonical' form where references to actual mfns
+ * are replaced with references to the corresponding pfns.
+ *
+ * This function performs the appropriate conversion, taking into account
+ * which entries do not require canonicalisation (in particular, those
+ * entries which map the virtual address reserved for the hypervisor).
+ *
+ *   type            XEN_DOMCTL_PFINFO_* type of the page-table page
+ *   pfn             not used by this function
+ *   spage, dpage    source page and destination page (PAGE_SIZE each)
+ *   live_p2m_table, live_m2p_table, p2m_size, max_mfn, guest_width
+ *                   referenced by name from the mfn_to_pfn()/pfn_to_mfn()/
+ *                   MFN_IS_IN_PSEUDOPHYS_MAP() macros; do not rename them
+ *   m2p_mfn0        first MFN of the m2p, used to recognise Xen's PAE L2
+ *   hvirt_start     start of the hypervisor's virtual address range
+ *   pt_levels       number of paging levels (2, 3 or 4)
+ *
+ * Returns 1 if an entry referenced an MFN outside the pseudophysical map
+ * (the entry is rewritten to pfn 0), otherwise 0.
+ */
+int xc_canonicalise_pagetable(unsigned long type, unsigned long pfn,
+                              const void *spage, void *dpage,
+                              xen_pfn_t *live_p2m_table,
+                              xen_pfn_t *live_m2p_table, unsigned long m2p_mfn0,
+                              unsigned long p2m_size, unsigned long max_mfn,
+                              unsigned long hvirt_start, unsigned int pt_levels,
+                              unsigned int guest_width)
+{
+    uint64_t pte;
+    int pte_last;
+    int xen_start;
+    int xen_end;
+    int i;
+    int race = 0;
+
+    /*
+     * We need to determine which entries in this page table hold
+     * reserved hypervisor mappings. This depends on the current
+     * page table type as well as the number of paging levels.
+     */
+    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
+
+    if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
+
+    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
+        xen_start = L3_PAGETABLE_ENTRIES_PAE;
+
+    /*
+     * In PAE only the L2 mapping the top 1GB contains Xen mappings.
+     * We can spot this by looking for the guest's mapping of the m2p.
+     * Guests must ensure that this check will fail for other L2s.
+     */
+    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+    {
+        int hstart;
+        uint64_t he;
+
+        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+        he = ((const uint64_t *)spage)[hstart];
+
+        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+        {
+            /* hvirt starts with xen stuff... */
+            xen_start = hstart;
+        }
+        else if ( hvirt_start != 0xf5800000 )
+        {
+            /* old L2s from before hole was shrunk... */
+            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+            he = ((const uint64_t *)spage)[hstart];
+            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+                xen_start = hstart;
+        }
+    }
+
+    if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
+    {
+        /*
+         * XXX SMH: should compute these from hvirt_start (which we have)
+         * and hvirt_end (which we don't)
+         */
+        xen_start = 256;
+        xen_end = 272;
+    }
+
+    /* Now iterate through the page table, canonicalising each PTE */
+    for ( i = 0; i < pte_last; i++ )
+    {
+        /* Named so that it no longer shadows the 'pfn' parameter. */
+        unsigned long canon_pfn;
+        unsigned long mfn;
+
+        if ( pt_levels == 2 )
+            pte = ((const uint32_t*)spage)[i];
+        else
+            pte = ((const uint64_t*)spage)[i];
+
+        /* Entries inside the hypervisor's range are written out as zero. */
+        if ( (i >= xen_start) && (i < xen_end) )
+            pte = 0;
+
+        if ( pte & _PAGE_PRESENT )
+        {
+            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
+            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
+            {
+                /*
+                 * This will happen if the type info is stale which
+                 * is quite feasible under live migration
+                 */
+                canon_pfn = 0; /* zap it - we'll retransmit this page later */
+                /*
+                 * XXX: We can't spot Xen mappings in compat-mode L2es
+                 * from 64-bit tools, but the only thing in them is the
+                 * compat m2p, so we quietly zap them. This doesn't
+                 * count as a race, so don't report it.
+                 */
+                if ( !(type == XEN_DOMCTL_PFINFO_L2TAB
+                       && sizeof(unsigned long) > guest_width) )
+                    race = 1; /* inform the caller; fatal if !live */
+            }
+            else
+                canon_pfn = mfn_to_pfn(mfn);
+
+            pte &= ~MADDR_MASK_X86;
+            pte |= (uint64_t)canon_pfn << PAGE_SHIFT;
+
+            /*
+             * PAE guest L3Es can contain these flags when running on
+             * a 64bit hypervisor. We zap these here to avoid any
+             * surprise at restore time...
+             */
+            if ( (pt_levels == 3)
+                 && (type == XEN_DOMCTL_PFINFO_L3TAB)
+                 && (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
+                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
+        }
+
+        if ( pt_levels == 2 )
+            ((uint32_t*)dpage)[i] = pte;
+        else
+            ((uint64_t*)dpage)[i] = pte;
+    }
+
+    return race;
+}
+
+/*
+ * Map a PV guest's physical-to-machine table read-only.
+ *
+ * The table is reached in three hops: the shared info frame gives the
+ * frame-list-list MFN, the frame-list-list gives the frame-list MFNs, and
+ * the frame list gives the MFNs of the table itself.  Each intermediate
+ * level is copied locally and widened or narrowed from the guest's word
+ * size (guest_width) to the tools' unsigned long before being used.
+ *
+ * p2m_size and guest_width are referenced by name from the P2M_* size
+ * macros, so they must keep these names.
+ *
+ * Returns the mapping (made with xc_map_foreign_batch) or NULL on failure.
+ * All intermediate mappings and copies are released before returning.
+ */
+xen_pfn_t *xc_get_live_p2m_table(int xc_handle, domid_t domain_id,
+                                 unsigned long p2m_size,
+                                 unsigned int guest_width)
+{
+    xc_dominfo_t info;
+    shared_info_t *live_shared_info = NULL;
+    xen_pfn_t *live_p2m_frame_list_list = NULL;
+    xen_pfn_t *live_p2m_frame_list = NULL;
+    xen_pfn_t *p2m_frame_list_list = NULL;
+    xen_pfn_t *p2m_frame_list = NULL;
+    xen_pfn_t *live_p2m_table = NULL;
+    int i;
+
+    /* Hop 0: locate and map the shared info frame. */
+    if ( xc_domain_getinfo(xc_handle, domain_id, 1, &info) != 1 )
+    {
+        ERROR("could not get domain info");
+        goto out;
+    }
+
+    live_shared_info = xc_map_foreign_range(xc_handle, domain_id,
+                                            PAGE_SIZE, PROT_READ,
+                                            info.shared_info_frame);
+    if ( live_shared_info == NULL )
+    {
+        ERROR("could not map live shared info");
+        goto out;
+    }
+
+    /* Hop 1: map the frame list list and take a private copy of it. */
+    live_p2m_frame_list_list =
+        xc_map_foreign_range(xc_handle, domain_id, PAGE_SIZE, PROT_READ,
+                             live_shared_info->arch.pfn_to_mfn_frame_list_list);
+    if ( live_p2m_frame_list_list == NULL )
+    {
+        ERROR("could not map live p2m frame list list");
+        goto out;
+    }
+
+    p2m_frame_list_list = malloc(PAGE_SIZE);
+    if ( p2m_frame_list_list == NULL )
+    {
+        ERROR("could not allocate p2m_frame_list_list array");
+        goto out;
+    }
+    memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
+
+    /*
+     * Convert the copy in place from guest words to our unsigned long.
+     * Narrowing walks upwards and widening walks downwards so that no
+     * entry is overwritten before it has been read.
+     */
+    if ( guest_width > sizeof(unsigned long) )
+    {
+        for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
+        {
+            if ( i < PAGE_SIZE/guest_width )
+                p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
+            else
+                p2m_frame_list_list[i] = 0;
+        }
+    }
+    else if ( guest_width < sizeof(unsigned long) )
+    {
+        for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
+            p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
+    }
+
+    /* Hop 2: map the frame list and take a private, zero-padded copy. */
+    live_p2m_frame_list = xc_map_foreign_batch(xc_handle, domain_id, PROT_READ,
+                                               p2m_frame_list_list,
+                                               P2M_FLL_ENTRIES);
+    if ( live_p2m_frame_list == NULL )
+    {
+        ERROR("could not map live p2m frame list");
+        goto out;
+    }
+
+    p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE);
+    if ( p2m_frame_list == NULL )
+    {
+        ERROR("could not allocate p2m frame list array");
+        goto out;
+    }
+    memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE);
+    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE);
+
+    /* Same in-place width conversion as above, for the frame list. */
+    if ( guest_width > sizeof(unsigned long) )
+    {
+        for ( i = 0; i < P2M_FL_ENTRIES; i++ )
+            p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
+    }
+    else if ( guest_width < sizeof(unsigned long) )
+    {
+        for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
+            p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
+    }
+
+    /* Hop 3: map the table itself. */
+    live_p2m_table = xc_map_foreign_batch(xc_handle, domain_id, PROT_READ,
+                                          p2m_frame_list,
+                                          P2M_FL_ENTRIES);
+    if ( live_p2m_table == NULL )
+        ERROR("could not map live p2m table");
+
+ out:
+    /* Only the final table mapping outlives this function. */
+    if ( live_p2m_frame_list != NULL )
+        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
+
+    if ( live_p2m_frame_list_list != NULL )
+        munmap(live_p2m_frame_list_list, PAGE_SIZE);
+
+    if ( live_shared_info != NULL )
+        munmap(live_shared_info, PAGE_SIZE);
+
+    free(p2m_frame_list);
+    free(p2m_frame_list_list);
+
+    return live_p2m_table;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/lib/xc.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/xc.h Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,72 @@
+/******************************************************************************
+ * tools/xencow/lib/xc.h
+ *
+ * libxc refactorisation. This should be put in libxc ultimately.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#ifndef XENCOW_LIB_XC_H
+#define XENCOW_LIB_XC_H
+
+/*
+ * Table accessors.  Both macros refer to variables by name rather than
+ * taking them as arguments: mfn_to_pfn() needs 'live_m2p_table' in scope,
+ * pfn_to_mfn() needs 'live_p2m_table' and 'guest_width'.
+ */
+#define mfn_to_pfn(_mfn)  (live_m2p_table[(_mfn)])
+
+/*
+ * Reads the P2M entry at the guest's word size.  For a guest_width other
+ * than 8 the table is read as 32-bit entries and 0xffffffff is widened to
+ * -1UL.
+ */
+#define pfn_to_mfn(_pfn)                                            \
+  ((xen_pfn_t) (((guest_width)==8)                                  \
+                ? (((uint64_t *)live_p2m_table)[(_pfn)])            \
+                : ((((uint32_t *)live_p2m_table)[(_pfn)]) == 0xffffffffU \
+                   ? (-1UL) : (((uint32_t *)live_p2m_table)[(_pfn)]))))
+
+
+/* Not compiled: a possible bundle for the arguments passed around below. */
+#if 0
+typedef struct xc_domain_st {
+    domid_t domain_id;
+    xen_pfn_t *live_p2m_table;
+    xen_pfn_t *live_m2p_table;
+    unsigned long m2p_mfn0;
+    unsigned long p2m_size;
+    unsigned long max_mfn;
+    unsigned long hvirt_start;
+    unsigned int pt_levels;
+    unsigned int guest_width;
+} xc_domain_t;
+#endif
+
+
+/*
+ * Map the machine-to-physical table with protection 'prot'.  Returns the
+ * mapping and stores the first chunk's MFN in *m2p_mfn0, or returns NULL.
+ */
+xen_pfn_t *xc_map_m2p(int xc_handle, unsigned long max_mfn, int prot,
+                      unsigned long *m2p_mfn0);
+
+/*
+ * Write a canonical copy of page-table page 'spage' into 'dpage', replacing
+ * MFNs with PFNs.  Returns non-zero if an entry could not be translated.
+ */
+int xc_canonicalise_pagetable(unsigned long type, unsigned long pfn,
+                              const void *spage, void *dpage,
+                              xen_pfn_t *live_p2m_table,
+                              xen_pfn_t *live_m2p_table, unsigned long m2p_mfn0,
+                              unsigned long p2m_size, unsigned long max_mfn,
+                              unsigned long hvirt_start, unsigned int pt_levels,
+                              unsigned int guest_width);
+
+/*
+ * Map the guest's physical-to-machine table read-only.  Returns the mapping
+ * or NULL on failure.
+ */
+xen_pfn_t *xc_get_live_p2m_table(int xc_handle, domid_t domain_id,
+                                 unsigned long p2m_size,
+                                 unsigned int guest_width);
+
+#endif /* XENCOW_LIB_XC_H */
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/lib/xencow.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/xencow.c Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,1072 @@
+/******************************************************************************
+ * tools/xencow/lib/xencow.c
+ *
+ * VM memory Copy-on-Write library.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+
+#include <pthread.h>
+#include <signal.h>
+
+#include <xen/domctl.h>
+
+#include <xc_private.h>
+#include <xg_save_restore.h>
+
+#include "xc.h"
+#include "xencow.h"
+
+
+/*
+ * Create 'filename', or truncate it if it already exists, and close it
+ * again.  The requested mode is read/write for user, group and other.
+ *
+ * Returns 0 on success or -EIO if the file could not be opened.
+ */
+static int xencow_create_file(const char *filename)
+{
+    int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR,
+                  S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+
+    if ( fd >= 0 )
+    {
+        close(fd);
+        return 0;
+    }
+
+    ERROR("Error opening file %s", filename);
+    return -EIO;
+}
+
+/*
+ * Open /dev/xencow0, issue a single ioctl on it and close it again.
+ *
+ * Returns -EIO if the device cannot be opened, otherwise whatever ioctl()
+ * returned (0 on success).
+ */
+static int xencow_send_ioctl(int cmd, unsigned long arg)
+{
+    int ret;
+    int fd = open("/dev/xencow0", O_RDWR);
+
+    if ( fd < 0 )
+    {
+        ERROR("Failed to open xencow device (/dev/xencow0)");
+        return -EIO;
+    }
+
+    ret = ioctl(fd, cmd, arg);
+    if ( ret != 0 )
+        ERROR("Error during ioctl of xencow device");
+
+    close(fd);
+    return ret;
+}
+
+/*
+ * Release the buffers owned by 'cow': the locked ring/page buffer, the MFN
+ * array and the live P2M mapping.  The xencow_t itself is not freed.
+ *
+ * Every released pointer is reset to NULL, so calling this twice is safe.
+ */
+static void xencow_free(xencow_t *cow)
+{
+    if ( cow->buffer != NULL )
+    {
+        munlock(cow->buffer, BUFFER_SIZE);
+        free(cow->buffer);
+        cow->buffer = NULL;
+    }
+
+    free(cow->mfns);
+    cow->mfns = NULL;
+
+    if ( cow->live_p2m_table != NULL )
+    {
+        /*
+         * The table comes from xc_get_live_p2m_table(), which maps
+         * P2M_FL_ENTRIES pages with xc_map_foreign_batch(); it has to be
+         * unmapped, not passed to free().
+         *
+         * NOTE(review): P2M_FL_ENTRIES is expected to be written in terms
+         * of variables named 'p2m_size' and 'guest_width' (that is how
+         * xc.c uses it) - confirm against xg_save_restore.h.
+         */
+        unsigned long p2m_size = cow->p2m_size;
+        unsigned int guest_width = cow->platform_info.guest_width;
+
+        munmap(cow->live_p2m_table, P2M_FL_ENTRIES * PAGE_SIZE);
+        cow->live_p2m_table = NULL;
+    }
+}
+
+/*
+ * Make *bitmap an all-zero bitmap able to hold 'bitmap_size' bits.
+ *
+ * If *bitmap is NULL a new bitmap is allocated; otherwise the existing one
+ * is cleared and must already be large enough for 'bitmap_size' bits.
+ *
+ * The size is rounded UP to whole longs.  Rounding down left the last
+ * (bitmap_size % BITS_PER_LONG) bits outside the allocation.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int xencow_alloc_bitmap(unsigned long **bitmap, unsigned long bitmap_size)
+{
+    /* Written as div + remainder so a huge bitmap_size cannot overflow. */
+    unsigned long nr_longs = bitmap_size / BITS_PER_LONG +
+                             ((bitmap_size % BITS_PER_LONG) != 0);
+
+    /* Never ask calloc() for zero elements; its result is then ambiguous. */
+    if ( nr_longs == 0 )
+        nr_longs = 1;
+
+    if ( *bitmap == NULL )
+    {
+        /* calloc() already hands back zeroed memory. */
+        *bitmap = calloc(nr_longs, sizeof(unsigned long));
+        if ( *bitmap == NULL )
+            return -ENOMEM;
+
+        return 0;
+    }
+
+    memset(*bitmap, 0, nr_longs * sizeof(unsigned long));
+
+    return 0;
+}
+
+/*
+ * Thread body: wait for events on the CoW ports forever.
+ *
+ *  - buffer event, or a wait result of -1: flush the buffer
+ *  - pause event: flush the buffer, then resume the domain
+ *  - anything else: report an unknown event
+ *
+ * 'c' is the xencow_t handed to pthread_create().  The loop never exits.
+ */
+static void *xencow_handle_events(void *c)
+{
+    xencow_t *cow = (xencow_t *)c;
+
+    IPRINTF("Starting resume thread\n");
+
+    for ( ; ; )
+    {
+        int port = xencow_wait_for_event_or_timeout(cow, 10);
+        int is_buffer_event = (port == cow->buffer_port);
+
+        if ( is_buffer_event || port == -1 )
+        {
+            if ( is_buffer_event )
+                DPRINTF("Got buffer event\n");
+            xencow_flush_buffer(cow);
+            continue;
+        }
+
+        if ( port != cow->pause_port )
+        {
+            ERROR("Unknown event");
+            continue;
+        }
+
+        /* Pause event: drain what Xen has copied so far, then resume. */
+        DPRINTF("Got pause event\n");
+
+        xencow_flush_buffer(cow);
+
+        if ( xencow_resume(cow) != 0 )
+            ERROR("Failed to resume domain");
+    }
+}
+
+/*
+ * Start a detached thread running start_routine(cow).
+ *
+ * SIGTERM, SIGINT, SIGHUP and SIGQUIT are blocked while the thread is
+ * created, so the new thread inherits a mask with them blocked.  The
+ * caller's own mask is restored afterwards, also when creation fails.
+ *
+ * Returns 0 on success or -EIO if the thread could not be created.
+ */
+static int xencow_start_thread(xencow_t *cow, void *(*start_routine) (void *))
+{
+    pthread_t thread;
+    sigset_t oldset;
+    sigset_t newset;
+    int ret;
+
+    sigemptyset(&newset);
+    sigaddset(&newset, SIGTERM);
+    sigaddset(&newset, SIGINT);
+    sigaddset(&newset, SIGHUP);
+    sigaddset(&newset, SIGQUIT);
+    pthread_sigmask(SIG_BLOCK, &newset, &oldset);
+
+    ret = pthread_create(&thread, NULL, start_routine, cow);
+
+    /* Put the caller's mask back before looking at the result. */
+    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
+
+    if ( ret != 0 )
+    {
+        ERROR("Failed to create thread");
+        return -EIO;
+    }
+
+    pthread_detach(thread);
+
+    return 0;
+}
+
+/*
+ * Allocate and lock the buffer shared with Xen, set up the ring at its
+ * start, obtain the buffer's MFNs from the xencow device and queue one
+ * request per page-buffer MFN.
+ *
+ * On success cow->buffer, cow->page_buffer, cow->front_ring, cow->sring_mfn,
+ * cow->mfns and cow->num_mfns are valid and 0 is returned.
+ *
+ * On failure a negative value is returned, everything allocated here has
+ * been released, and cow->buffer, cow->page_buffer and cow->mfns are NULL
+ * so a later xencow_free() cannot free them a second time.
+ */
+static int xencow_init_buffer(xencow_t *cow)
+{
+    void *buffer = NULL;
+    cow_init_t *cow_init;
+    cow_request_t req;
+    RING_IDX req_prod;
+    int num_pages;
+    int num_mfns;
+    int i;
+    int ret;
+
+    DPRINTF("buffer size: %ld\n", BUFFER_SIZE);
+
+    /* posix_memalign() returns a positive error number; it leaves errno alone. */
+    ret = posix_memalign(&buffer, PAGE_SIZE, BUFFER_SIZE);
+    if ( ret != 0 )
+        return -ret;
+
+    /* Lock buffer in memory so it can't be paged out */
+    if ( mlock(buffer, BUFFER_SIZE) != 0 )
+    {
+        ret = -errno;
+        goto err_free_buffer;
+    }
+
+    cow->buffer = buffer;
+    cow->page_buffer = buffer + XEN_COW_RING_SIZE;
+
+    /* Initialise ring */
+    SHARED_RING_INIT((cow_sring_t *)cow->buffer);
+    FRONT_RING_INIT(&cow->front_ring, (cow_sring_t *)cow->buffer, XEN_COW_RING_SIZE);
+
+    num_mfns = RING_SIZE(&cow->front_ring);
+    num_pages = XEN_COW_RING_PAGES + num_mfns;
+
+    DPRINTF("number of ring entries: %u\n", RING_SIZE(&cow->front_ring));
+
+    /* The ioctl argument carries one MFN slot per page of the buffer. */
+    cow_init = malloc(sizeof(cow_init_t) + (sizeof(unsigned long) * num_pages));
+    if ( cow_init == NULL )
+    {
+        ret = -ENOMEM;
+        goto err_unlock;
+    }
+
+    cow_init->addr = (unsigned long)(cow->buffer);
+    cow_init->num_mfns = num_pages;
+
+    /* Get MFNs */
+    ret = xencow_send_ioctl(XEN_COW_IOCTL_INIT, (unsigned long)cow_init);
+    if ( ret != 0 )
+        goto err_free_init;
+
+    cow->mfns = calloc(num_mfns, sizeof(unsigned long));
+    if ( cow->mfns == NULL )
+    {
+        ret = -ENOMEM;
+        goto err_free_init;
+    }
+    cow->num_mfns = num_mfns;
+
+    /* First MFN is the ring page; page-buffer MFNs follow the ring pages. */
+    cow->sring_mfn = cow_init->mfns[0];
+    memcpy(cow->mfns, &cow_init->mfns[XEN_COW_RING_PAGES],
+           sizeof(unsigned long) * cow->num_mfns);
+
+    /* Fill ring with page buffer MFNs */
+    req_prod = cow->front_ring.req_prod_pvt;
+    for ( i = 0; i < cow->num_mfns; i++ )
+    {
+        req.mfn = cow->mfns[i];
+        memcpy(RING_GET_REQUEST(&cow->front_ring, req_prod + i), &req,
+               sizeof(cow_request_t));
+    }
+
+    cow->front_ring.req_prod_pvt = req_prod + i;
+    RING_PUSH_REQUESTS(&cow->front_ring);
+
+    free(cow_init);
+    return 0;
+
+ err_free_init:
+    free(cow_init);
+ err_unlock:
+    munlock(buffer, BUFFER_SIZE);
+ err_free_buffer:
+    free(buffer);
+    /* Do not leave pointers to the freed buffer behind in 'cow'. */
+    cow->buffer = NULL;
+    cow->page_buffer = NULL;
+    return ret;
+}
+
+/*
+ * Open the control interface and the event channel interface, then bind
+ * the two CoW VIRQs (VIRQ_COW_BUFFER and VIRQ_COW_PAUSE).
+ *
+ * Returns 0 on success.  On failure returns -EINVAL, closes whatever was
+ * opened here and resets cow->xc_handle / cow->xce_handle to -1.
+ */
+static int xencow_init_xen(xencow_t *cow)
+{
+    /* Open connection to Xen */
+    cow->xc_handle = xc_interface_open();
+    if ( cow->xc_handle < 0 )
+    {
+        ERROR("Failed to connect to Xen");
+        cow->xc_handle = -1;
+        return -EINVAL;
+    }
+
+    /* Open event channel */
+    cow->xce_handle = xc_evtchn_open();
+    if ( cow->xce_handle < 0 )
+    {
+        ERROR("Failed to open event channel");
+        cow->xce_handle = -1;
+        goto err_close_xc;
+    }
+
+    /* Bind VIRQ ports for event notification */
+    cow->buffer_port = xc_evtchn_bind_virq(cow->xce_handle, VIRQ_COW_BUFFER);
+    if ( cow->buffer_port < 0 )
+    {
+        ERROR("Failed to bind VIRQ");
+        goto err_close_evtchn;
+    }
+
+    cow->pause_port = xc_evtchn_bind_virq(cow->xce_handle, VIRQ_COW_PAUSE);
+    if ( cow->pause_port < 0 )
+    {
+        ERROR("Failed to bind VIRQ");
+        goto err_close_evtchn;
+    }
+
+    return 0;
+
+ err_close_evtchn:
+    xc_evtchn_close(cow->xce_handle);
+    cow->xce_handle = -1;
+ err_close_xc:
+    xc_interface_close(cow->xc_handle);
+    cow->xc_handle = -1;
+    return -EINVAL;
+}
+
+/*
+ * Fill in the per-domain fields of 'cow': is_hvm, p2m_size (maximum GPFN
+ * plus one) and platform_info.
+ *
+ * Returns 0 on success or -EINVAL if any query fails.
+ */
+static int xencow_init_domain_info(xencow_t *cow)
+{
+    xc_dominfo_t info;
+    int max_gpfn;
+    int rc;
+
+    /*
+     * Get HVM info.  xc_domain_getinfo() reports the first domain whose id
+     * is >= the requested one, so the returned id must be checked too.
+     */
+    rc = xc_domain_getinfo(cow->xc_handle, cow->domain_id, 1, &info);
+    if ( rc != 1 || info.domid != cow->domain_id )
+    {
+        ERROR("Failed to get domain info");
+        goto err;
+    }
+    cow->is_hvm = info.hvm;
+
+    /* Get memory size; a negative result is an error, not a frame number. */
+    max_gpfn = xc_memory_op(cow->xc_handle, XENMEM_maximum_gpfn,
+                            &cow->domain_id);
+    if ( max_gpfn < 0 )
+    {
+        ERROR("Failed to get maximum gpfn");
+        goto err;
+    }
+    cow->p2m_size = max_gpfn + 1;
+
+    /* Get platform info */
+    rc = get_platform_info(cow->xc_handle, cow->domain_id,
+                           &cow->platform_info.max_mfn,
+                           &cow->platform_info.hvirt_start,
+                           &cow->platform_info.pt_levels,
+                           &cow->platform_info.guest_width);
+    if ( rc != 1 )
+    {
+        ERROR("Failed to get platform info");
+        goto err;
+    }
+
+    return 0;
+
+ err:
+    return -EINVAL;
+}
+
+/*
+ * Create a CoW context for domain 'domid': allocate the context, set up the
+ * buffer shared with Xen, connect to Xen, query the domain and start the
+ * event handler thread.
+ *
+ * Returns the new context, or NULL on failure.  On failure everything
+ * acquired here is released, including the context itself and any Xen
+ * handles still open.  errno is set to ENOMEM when the context itself
+ * cannot be allocated.
+ */
+xencow_t *xencow_init(domid_t domid)
+{
+    xencow_t *cow;
+    int rc;
+
+    /* Initialise CoW struct */
+    cow = calloc(1, sizeof(xencow_t));
+    if ( cow == NULL )
+    {
+        errno = ENOMEM;
+        return NULL;
+    }
+
+    cow->domain_id = domid;
+    cow->xc_handle = -1;
+    cow->xce_handle = -1;
+
+    INIT_LIST_HEAD(&cow->snapshots);
+
+    /* Initialise locks */
+    cow_ring_lock_init(cow);
+    cow_snapshots_lock_init(cow);
+
+    /* Initialise buffer */
+    IPRINTF("Initialising buffer\n");
+    rc = xencow_init_buffer(cow);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to initialise buffer");
+        /*
+         * xencow_init_buffer() releases its own allocations when it fails,
+         * so xencow_free() must not run here: it would free the buffer a
+         * second time.
+         */
+        goto out_free_cow;
+    }
+
+    /* Initialise connection to Xen */
+    rc = xencow_init_xen(cow);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to initialise connection to Xen");
+        goto out;
+    }
+
+    /* Get domain info */
+    rc = xencow_init_domain_info(cow);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to get domain info");
+        goto out;
+    }
+
+    /* Start event handler thread */
+    rc = xencow_start_thread(cow, xencow_handle_events);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to start event handler thread");
+        goto out;
+    }
+
+    return cow;
+
+ out:
+    /* Handles are -1 unless they are currently open. */
+    if ( cow->xce_handle >= 0 )
+        xc_evtchn_close(cow->xce_handle);
+    if ( cow->xc_handle >= 0 )
+        xc_interface_close(cow->xc_handle);
+    xencow_free(cow);
+ out_free_cow:
+    free(cow);
+    return NULL;
+}
+
+/*
+ * Open, read-only, the file of 'snapshot' that holds 'pfn': the state file
+ * when state_pfn(pfn) is true, the backing file otherwise.
+ *
+ * Returns the descriptor from open(), i.e. -1 with errno set on failure.
+ */
+static int xencow_open_snapshot_file_for_reading(xencow_snapshot_t *snapshot,
+                                                 unsigned long pfn)
+{
+    char *filename = state_pfn(pfn) ? snapshot->state_file
+                                    : snapshot->backing_file;
+
+    return open(filename, O_RDONLY, S_IRUSR | S_IRGRP | S_IROTH);
+}
+
+/*
+ * Pick the snapshot that a page produced at ring index 'now' belongs to and
+ * open the matching file read/write.
+ *
+ * The snapshot list is walked from the newest end and the first snapshot
+ * with when <= now is chosen; it is returned through *snapshot.  The state
+ * file is opened when state_pfn(pfn) is true, the backing file otherwise.
+ *
+ * Returns the descriptor from open().  If no snapshot qualifies, *snapshot
+ * is set to NULL and -1 is returned with errno set to ENOENT.
+ */
+static int xencow_open_snapshot_file_for_writing(xencow_t *cow,
+                                                 xencow_snapshot_t **snapshot,
+                                                 RING_IDX now,
+                                                 unsigned long pfn)
+{
+    int open_flags = O_RDWR;
+    mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR | S_IWGRP | S_IWOTH;
+    char *filename;
+
+    /* Get the right snapshot to copy this page to */
+    list_for_each_entry_reverse ( (*snapshot), &cow->snapshots, list )
+        if ( (*snapshot)->when <= now )
+            break;
+
+    /*
+     * NOTE(review): assumes xencow_list.h follows the Linux list macros,
+     * where a loop that runs to completion leaves the cursor on the list
+     * head itself.  That is not a real snapshot and must not be used.
+     */
+    if ( &(*snapshot)->list == &cow->snapshots )
+    {
+        *snapshot = NULL;
+        errno = ENOENT;
+        return -1;
+    }
+
+    /* Open file */
+    if ( state_pfn(pfn) )
+        filename = (*snapshot)->state_file;
+    else
+        filename = (*snapshot)->backing_file;
+
+    return open(filename, open_flags, open_mode);
+}
+
+/*
+ * Read the PAGE_SIZE bytes stored for 'pfn' (at pfn_offset(pfn)) from 'fd'
+ * into 'buffer_page'.
+ *
+ * Short reads are continued and EINTR is retried.  Returns 0 on success,
+ * -errno if the seek or a read fails, and -EIO if the file ends before a
+ * whole page has been read.
+ */
+static int xencow_read_page(int fd, unsigned long pfn, void *buffer_page)
+{
+    off64_t offset = pfn_offset(pfn);
+    int total_read = 0;
+    int err;
+
+    if ( lseek64(fd, offset, SEEK_SET) == (off64_t)-1 )
+    {
+        err = errno;    /* save before ERROR() can change it */
+        ERROR("Error seeking: %lld", (long long)offset);
+        return -err;
+    }
+
+    while ( total_read < PAGE_SIZE )
+    {
+        void *p = (char *)buffer_page + total_read;
+        ssize_t bytes_read = read(fd, p, PAGE_SIZE - total_read);
+
+        if ( bytes_read < 0 )
+        {
+            if ( errno == EINTR )
+                continue;
+
+            err = errno;
+            ERROR("Read error");
+            return -err;
+        }
+
+        if ( bytes_read == 0 )
+        {
+            /* End of file: errno is meaningless here, so report -EIO. */
+            ERROR("Read error");
+            return -EIO;
+        }
+
+        total_read += bytes_read;
+    }
+
+    return 0;
+}
+
+/*
+ * Copy the current contents of guest page 'pfn' into 'buffer_page'.
+ *
+ * The buffer is zero-filled instead when the frame is not mapped, when its
+ * type is XEN_DOMCTL_PFINFO_XTAB, or when the foreign mapping fails.
+ *
+ * Returns 0 on success.  If the page type cannot be queried (PV only) the
+ * error from xc_get_pfn_type_batch() is returned and the buffer is left
+ * untouched.  The temporary mapping is removed on every path.
+ */
+static int xencow_read_live_page(xencow_t *cow, unsigned long pfn, void *buffer_page)
+{
+    unsigned long mfn;
+    void *page;
+    int copy_frame = 0;
+    int ret = 0;
+
+    /* Get MFN */
+    mfn = xencow_p2m(cow, pfn);
+
+    /* Check if MFN is mapped */
+    if ( !is_mapped(mfn) )
+    {
+        /* Copy blank page */
+        memset(buffer_page, 0, PAGE_SIZE);
+        return 0;
+    }
+
+    /* 'mfn' is passed by address; the HVM test below reads it afterwards. */
+    page = xc_map_foreign_batch(cow->xc_handle, cow->domain_id, PROT_READ, &mfn, 1);
+
+    if ( cow->is_hvm )
+    {
+        if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) != XEN_DOMCTL_PFINFO_XTAB )
+            copy_frame = 1;
+    }
+    else
+    {
+        /* The type query works on 32-bit entries, so use a real uint32_t. */
+        uint32_t pfn_type = (uint32_t)mfn;
+
+        ret = xc_get_pfn_type_batch(cow->xc_handle, cow->domain_id, 1, &pfn_type);
+        if ( ret != 0 )
+        {
+            ERROR("get_pfn_type_batch failed");
+            goto out_unmap;
+        }
+
+        if ( (pfn_type & XEN_DOMCTL_PFINFO_LTAB_MASK) != XEN_DOMCTL_PFINFO_XTAB )
+            copy_frame = 1;
+    }
+
+    if ( copy_frame && page != NULL )
+        /* Copy live page */
+        memcpy(buffer_page, page, PAGE_SIZE);
+    else
+        /* Copy blank page */
+        memset(buffer_page, 0, PAGE_SIZE);
+
+ out_unmap:
+    if ( page != NULL )
+        munmap(page, PAGE_SIZE);
+
+    return ret;
+}
+
+/*
+ * Read page 'pfn' from 'snapshot' into 'buffer_page', if the snapshot's
+ * bitmap says the page was saved.
+ *
+ * Returns 0 on success, -ENOENT if the bitmap bit for 'pfn' is clear,
+ * -errno if the snapshot file cannot be opened, or the error from
+ * xencow_read_page().
+ */
+static int xencow_read_snapshot_page(xencow_snapshot_t *snapshot,
+                                     unsigned long pfn,
+                                     void *buffer_page)
+{
+    int fd;
+    int ret;
+
+    if ( !test_bit(pfn, snapshot->bitmap) )
+        return -ENOENT;
+
+    /* Open file for reading */
+    fd = xencow_open_snapshot_file_for_reading(snapshot, pfn);
+    if ( fd < 0 )
+    {
+        /* Capture errno first; ERROR() may overwrite it. */
+        ret = -errno;
+        ERROR("Error opening file");
+        return ret;
+    }
+
+    /* Read file and close */
+    ret = xencow_read_page(fd, pfn, buffer_page);
+    close(fd);
+
+    return ret;
+}
+
+/*
+ * Read 'num_pages' pages starting at 'start_pfn', as seen by snapshot
+ * number 'snapshot_num' (0 is the first entry of cow->snapshots), into
+ * 'buffer'.
+ *
+ * For each page, in order of preference:
+ *   1. the requested snapshot, if its bitmap has the page;
+ *   2. the first later snapshot that has the page;
+ *   3. the live domain.  The buffer is then flushed and the newest
+ *      snapshot is consulted once more, in case the page was dirtied
+ *      while it was being read.
+ *
+ * Returns 0 on success, -ENOENT if 'snapshot_num' does not exist, or a
+ * negative error from the open/read helpers.
+ */
+int xencow_read_buffer(xencow_t *cow, int snapshot_num, unsigned long start_pfn,
+                       int num_pages, void *buffer)
+{
+    xencow_snapshot_t *base;    /* the requested snapshot; never advanced */
+    xencow_snapshot_t *later;   /* cursor for searching newer snapshots */
+    int current_num;
+    int fd;
+    int i;
+    int ret;
+
+    cow_snapshots_lock(cow);
+
+    current_num = 0;
+    list_for_each_entry ( base, &cow->snapshots, list )
+    {
+        if ( current_num == snapshot_num )
+            break;
+        current_num++;
+    }
+
+    /*
+     * NOTE(review): assumes Linux-style list macros, where a loop that runs
+     * off the end leaves the cursor on the list head.  In that case the
+     * requested snapshot does not exist.
+     */
+    if ( &base->list == &cow->snapshots )
+    {
+        ret = -ENOENT;
+        goto out_open;
+    }
+
+    /* Open file */
+    fd = xencow_open_snapshot_file_for_reading(base, start_pfn);
+    if ( fd < 0 )
+    {
+        ret = -errno;
+        goto out_open;
+    }
+
+    /* Read pages */
+    for ( i = 0; i < num_pages; i++ )
+    {
+        void *buffer_page = (char *)buffer + ((unsigned long)i * PAGE_SIZE);
+        unsigned long pfn = start_pfn + i;
+        int found = 0;
+
+        /* Check bitmap for page */
+        if ( test_bit(pfn, base->bitmap) )
+        {
+            ret = xencow_read_page(fd, pfn, buffer_page);
+            if ( ret != 0 )
+                goto out;
+            continue;
+        }
+
+        /*
+         * Check later snapshots.  A separate cursor is used so that 'base'
+         * still names the requested snapshot for the next page.
+         */
+        later = base;
+        list_for_each_entry_continue ( later, &cow->snapshots, list )
+        {
+            ret = xencow_read_snapshot_page(later, pfn, buffer_page);
+            if ( ret == 0 )
+            {
+                found = 1;
+                break;
+            }
+        }
+
+        if ( found )
+            continue;
+
+        /* If not found, read page from live domain */
+        ret = xencow_read_live_page(cow, pfn, buffer_page);
+        if ( ret != 0 )
+            goto out;
+
+        /* Flush buffer; it takes the snapshots lock itself. */
+        cow_snapshots_unlock(cow);
+        xencow_flush_buffer(cow);
+        cow_snapshots_lock(cow);
+
+        /* Check (latest) bitmap for page again */
+        later = list_bottom(&cow->snapshots, xencow_snapshot_t, list);
+
+        ret = xencow_read_snapshot_page(later, pfn, buffer_page);
+        if ( ret == 0 )
+            DPRINTF("Page dirtied since read from live\n");
+    }
+
+    ret = 0;
+
+ out:
+    close(fd);
+ out_open:
+    cow_snapshots_unlock(cow);
+    return ret;
+}
+
+/*
+ * Write the PAGE_SIZE bytes at 'page' to 'fd' at byte offset 'offset'.
+ *
+ * Short writes are continued and EINTR is retried.  Returns 0 on success,
+ * -errno if the seek or a write fails, and -EIO if write() makes no
+ * progress.
+ */
+static int xencow_write_page(int fd, off_t offset, void *page)
+{
+    int total_written = 0;
+    int err;
+
+    if ( lseek64(fd, offset, SEEK_SET) == (off64_t)-1 )
+    {
+        err = errno;    /* save before ERROR() can change it */
+        ERROR("Error seeking: %lld", (long long)offset);
+        return -err;
+    }
+
+    /* Write page */
+    while ( total_written < PAGE_SIZE )
+    {
+        void *p = (char *)page + total_written;
+        ssize_t bytes_written;
+
+        DPRINTF("Writing first chunk: %lx\n", *((unsigned long *)p));
+
+        bytes_written = write(fd, p, PAGE_SIZE - total_written);
+        if ( bytes_written < 0 )
+        {
+            if ( errno == EINTR )
+                continue;
+
+            err = errno;
+            ERROR("Error writing");
+            return -err;
+        }
+
+        if ( bytes_written == 0 )
+        {
+            /* No progress and no errno to report: avoid returning 0. */
+            ERROR("Error writing");
+            return -EIO;
+        }
+
+        total_written += bytes_written;
+
+        DPRINTF("Wrote %d bytes\n", (int)bytes_written);
+    }
+
+    return 0;
+}
+
+/*
+ * Write one buffered page to the already-open snapshot file 'fd'.
+ *
+ * State pages are written at pfn_offset(now - snapshot->when).  Ordinary
+ * pages are written at pfn_offset(pfn), and only if the page's bit in the
+ * snapshot bitmap was clear; the bit is set here and cleared again if the
+ * write fails.
+ *
+ * Returns -1 if an ordinary page was already recorded in the bitmap,
+ * otherwise the result of xencow_write_page().
+ */
+static int xencow_flush_page(xencow_t *cow, xencow_snapshot_t *snapshot,
+                             RING_IDX now, int fd, unsigned long pfn,
+                             void *page)
+{
+    const int is_state = state_pfn(pfn) ? 1 : 0;
+    off_t offset;
+    int ret;
+
+    if ( is_state )
+    {
+        offset = pfn_offset(now - snapshot->when);
+    }
+    else
+    {
+        if ( test_and_set_bit(pfn, snapshot->bitmap) )
+            return -1;
+
+        offset = pfn_offset(pfn);
+    }
+
+    /* Write to file */
+    ret = xencow_write_page(fd, offset, page);
+    if ( !is_state && ret != 0 )
+        clear_bit(pfn, snapshot->bitmap);
+
+    return ret;
+}
+
+/* 1: publish consumed responses and recycled requests once per flush. */
+#define BATCH_REQS 1
+
+/*
+ * Drain the response ring: write every page Xen has placed in the page
+ * buffer to the appropriate snapshot file, then hand the page's MFN back
+ * to Xen as a fresh request.
+ *
+ * Does nothing while cow->next_snapshot is 0.  The ring lock is held for
+ * the whole flush; the snapshots lock is held only while the snapshot file
+ * is chosen and opened.
+ *
+ * If a snapshot file cannot be opened the flush stops at that response.
+ * Both locks are released, and the responses handled so far are still
+ * committed, so the failing one is seen again by the next flush.
+ */
+void xencow_flush_buffer(xencow_t *cow)
+{
+    cow_request_t req;
+    RING_IDX req_prod;
+    RING_IDX rsp_prod;
+    RING_IDX i;
+    xencow_snapshot_t *snapshot = NULL;
+    int fd = -1;
+    int fd_is_state = 0;
+
+    if ( cow->next_snapshot == 0 )
+        return;
+
+    cow_ring_lock(cow);
+
+    rsp_prod = cow->front_ring.sring->rsp_prod;
+    req_prod = cow->front_ring.sring->req_prod;
+
+    /* Flush buffer pages */
+    for ( i = cow->front_ring.rsp_cons; i != rsp_prod; i++ )
+    {
+        cow_response_t rsp;
+        void *page = cow->page_buffer
+            + (RING_MASK(&cow->front_ring, i) << PAGE_SHIFT);
+
+        memcpy(&rsp, RING_GET_RESPONSE(&cow->front_ring, i),
+               sizeof(cow_response_t));
+
+        DPRINTF("num: %lx; pfn: %lx; page first chunk: %lx\n",
+                (unsigned long)i, rsp.pfn, *((unsigned long *)page));
+
+        /* Open appropriate file */
+        cow_snapshots_lock(cow);
+
+        /* Reopen only when there is no file yet or the page kind changes. */
+        if ( (fd < 0) ||
+             (state_pfn(rsp.pfn) && !fd_is_state) ||
+             (!state_pfn(rsp.pfn) && fd_is_state) )
+        {
+            if ( fd >= 0 )
+                close(fd);
+
+            fd = xencow_open_snapshot_file_for_writing(cow, &snapshot, i, rsp.pfn);
+            if ( fd < 0 )
+            {
+                ERROR("Error opening file");
+                /* Leave through the common exit so no lock stays held. */
+                cow_snapshots_unlock(cow);
+                break;
+            }
+
+            fd_is_state = state_pfn(rsp.pfn);
+            if ( fd_is_state )
+                DPRINTF("state page: %d\n", i);
+            else
+                DPRINTF("normal page: %d\n", i);
+        }
+
+        cow_snapshots_unlock(cow);
+
+        /* Flush buffer page */
+        xencow_flush_page(cow, snapshot, i, fd, rsp.pfn, page);
+
+#if !BATCH_REQS
+        cow->front_ring.rsp_cons = i + 1;
+        cow->front_ring.sring->rsp_event = i + 2;
+#endif
+
+        /* Put buffer page MFN in ring */
+        req.mfn = cow->mfns[RING_MASK(&cow->front_ring, i)];
+        memcpy(RING_GET_REQUEST(&cow->front_ring, req_prod), &req,
+               sizeof(cow_request_t));
+        req_prod++;
+
+#if !BATCH_REQS
+        /* Push added MFN out */
+        cow->front_ring.req_prod_pvt = req_prod;
+        RING_PUSH_REQUESTS(&cow->front_ring);
+#endif
+    }
+
+    if ( fd >= 0 )
+        close(fd);
+
+#if BATCH_REQS
+    /* 'i' is one past the last response that was fully handled. */
+    cow->front_ring.rsp_cons = i;
+    cow->front_ring.sring->rsp_event = i + 1;
+
+    /* Push added MFNs out */
+    cow->front_ring.req_prod_pvt = req_prod;
+    RING_PUSH_REQUESTS(&cow->front_ring);
+#endif
+
+    cow_ring_unlock(cow);
+}
+
+/*
+ * Issue XEN_DOMCTL_cow_resume for the domain of 'cow'.
+ * Returns the result of do_domctl().
+ */
+int xencow_resume(xencow_t *cow)
+{
+    DECLARE_DOMCTL;
+    int rc;
+
+    domctl.domain = cow->domain_id;
+    domctl.cmd = XEN_DOMCTL_cow_resume;
+
+    rc = do_domctl(cow->xc_handle, &domctl);
+    return rc;
+}
+
+/*
+ * Issue XEN_DOMCTL_cow_enable for the domain of 'cow', passing the MFN of
+ * the shared ring page (cow->sring_mfn).
+ * Returns the result of do_domctl().
+ */
+int xencow_enable(xencow_t *cow)
+{
+    DECLARE_DOMCTL;
+    int rc;
+
+    domctl.domain = cow->domain_id;
+    domctl.cmd = XEN_DOMCTL_cow_enable;
+    domctl.u.cow_enable.mfn = cow->sring_mfn;
+
+    rc = do_domctl(cow->xc_handle, &domctl);
+    return rc;
+}
+
+/*
+ * Issue XEN_DOMCTL_cow_disable for the domain of 'cow'.
+ * Returns the result of do_domctl().
+ */
+int xencow_disable(xencow_t *cow)
+{
+    DECLARE_DOMCTL;
+    int rc;
+
+    domctl.domain = cow->domain_id;
+    domctl.cmd = XEN_DOMCTL_cow_disable;
+
+    rc = do_domctl(cow->xc_handle, &domctl);
+    return rc;
+}
+
+/*
+ * Translate guest frame 'pfn' for use with the foreign mapping calls.
+ *
+ * For HVM domains the PFN is returned unchanged.  For PV domains the live
+ * P2M table is mapped on first use and the entry for 'pfn' is read at the
+ * guest's word size via pfn_to_mfn().
+ *
+ * Returns ~0UL if the table cannot be mapped or 'pfn' is not below
+ * cow->p2m_size.
+ * NOTE(review): callers test the result with is_mapped(); confirm that
+ * is_mapped(~0UL) is false.
+ */
+unsigned long xencow_p2m(xencow_t *cow, unsigned long pfn)
+{
+    /* pfn_to_mfn() refers to these two locals by name. */
+    void *live_p2m_table;
+    unsigned int guest_width;
+
+    if ( cow->is_hvm )
+        return pfn;
+
+    if ( cow->live_p2m_table == NULL )
+        cow->live_p2m_table =
+            xc_get_live_p2m_table(cow->xc_handle, cow->domain_id, cow->p2m_size,
+                                  cow->platform_info.guest_width);
+
+    /* Mapping failed, or the frame lies outside the table. */
+    if ( cow->live_p2m_table == NULL || pfn >= cow->p2m_size )
+        return ~0UL;
+
+    live_p2m_table = cow->live_p2m_table;
+    guest_width = cow->platform_info.guest_width;
+
+    return pfn_to_mfn(pfn);
+}
+
+/* Return a malloc()ed copy of @s, or NULL if the allocation fails. */
+static char *xencow_copy_string(const char *s)
+{
+    size_t len = strlen(s) + 1;
+    char *copy = malloc(len);
+
+    if ( copy != NULL )
+        memcpy(copy, s, len);
+
+    return copy;
+}
+
+/*
+ * Build the file names for the next snapshot, create the backing and state
+ * files, and store heap copies of the three names in @snapshot.
+ *
+ * Names are derived from cow->domain_id and cow->next_snapshot:
+ *   xencowfs name: "<domid>.<n>"
+ *   backing file : "/tmp/xencow<domid>.<n>"
+ *   state file   : "<backing file>.state"
+ *
+ * Returns 0 on success.  On failure returns the non-zero result of
+ * xencow_create_file(), or -1 if a name does not fit or memory runs out; in
+ * the latter case the three name pointers in @snapshot are left NULL.
+ */
+static int xencow_create_snapshot_files(xencow_t *cow,
+                                        xencow_snapshot_t *snapshot)
+{
+    /* Scratch names live on the stack so no path can leak them. */
+    char fuse_file[8 * sizeof(int) + 4];
+    char backing_file[200];
+    char state_file[200];
+    int n;
+    int ret;
+
+    /* FIXME: Don't hardcode the path */
+    /* Get file names */
+    n = snprintf(fuse_file, sizeof(fuse_file), "%d.%d",
+                 cow->domain_id, cow->next_snapshot);
+    if ( n < 0 || (size_t)n >= sizeof(fuse_file) )
+        return -1;
+
+    n = snprintf(backing_file, sizeof(backing_file), "/tmp/xencow%s",
+                 fuse_file);
+    if ( n < 0 || (size_t)n >= sizeof(backing_file) )
+        return -1;
+
+    n = snprintf(state_file, sizeof(state_file), "%s.state", backing_file);
+    if ( n < 0 || (size_t)n >= sizeof(state_file) )
+        return -1;
+
+    DPRINTF("fuse: %s; backing: %s; state: %s\n", fuse_file, backing_file,
+            state_file);
+
+    /* Create backing files */
+    ret = xencow_create_file(backing_file);
+    if ( ret != 0 )
+        return ret;
+
+    ret = xencow_create_file(state_file);
+    if ( ret != 0 )
+        return ret;
+
+    /* Store file names */
+    snapshot->xencowfs_file = xencow_copy_string(fuse_file);
+    snapshot->backing_file = xencow_copy_string(backing_file);
+    snapshot->state_file = xencow_copy_string(state_file);
+
+    if ( snapshot->xencowfs_file == NULL ||
+         snapshot->backing_file == NULL ||
+         snapshot->state_file == NULL )
+    {
+        ERROR("Error allocating snapshot file names");
+        free(snapshot->xencowfs_file);
+        free(snapshot->backing_file);
+        free(snapshot->state_file);
+        snapshot->xencowfs_file = NULL;
+        snapshot->backing_file = NULL;
+        snapshot->state_file = NULL;
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Allocate a zeroed snapshot record, give it a bitmap sized by
+ * cow->p2m_size, create its backing files, and append it to cow->snapshots.
+ *
+ * Returns 0 on success.  On failure returns -1 (record allocation) or the
+ * non-zero result of the failing helper; nothing is added to the list.
+ */
+static int xencow_init_snapshot(xencow_t *cow)
+{
+    xencow_snapshot_t *snapshot = calloc(1, sizeof(*snapshot));
+    int ret;
+
+    if ( snapshot == NULL )
+    {
+        ERROR("Error allocating snapshot");
+        return -1;
+    }
+
+    ret = xencow_alloc_bitmap(&snapshot->bitmap, cow->p2m_size);
+    if ( ret != 0 )
+    {
+        ERROR("Error allocating bitmap");
+        free(snapshot);
+        return ret;
+    }
+
+    ret = xencow_create_snapshot_files(cow, snapshot);
+    if ( ret != 0 )
+    {
+        ERROR("Error creating backing files");
+        /*
+         * NOTE(review): snapshot->bitmap is still allocated here.  The
+         * matching release routine for xencow_alloc_bitmap() is not visible
+         * from this function -- confirm and free the bitmap as well.
+         */
+        free(snapshot);
+        return ret;
+    }
+
+    list_add_tail(&snapshot->list, &cow->snapshots);
+
+    return 0;
+}
+
+/*
+ * Ask Xen to take a CoW snapshot of the domain, then create the matching
+ * userspace snapshot record and stamp it with the value returned in
+ * domctl.u.cow_snapshot.when.
+ *
+ * The wall-clock duration of the domctl is logged in microseconds.
+ * Returns 0 on success or the non-zero result of the failing step.
+ */
+static int xencow_take_snapshot(xencow_t *cow)
+{
+    struct timeval before;
+    struct timeval after;
+    double time_diff;
+    xencow_snapshot_t *snapshot;
+    int ret;
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_cow_snapshot;
+    domctl.domain = cow->domain_id;
+
+    gettimeofday(&before, NULL);
+
+    ret = do_domctl(cow->xc_handle, &domctl);
+    if ( ret != 0 )
+    {
+        ERROR("Error taking snapshot");
+        return ret;
+    }
+
+    gettimeofday(&after, NULL);
+
+    /*
+     * Combine seconds and microseconds.  Using tv_usec alone gives a wrong
+     * (even negative) value whenever the call crosses a second boundary.
+     */
+    time_diff = (double)(after.tv_sec - before.tv_sec) * 1000000.0 +
+                (double)(after.tv_usec - before.tv_usec);
+    IPRINTF("Time spent paused: %fus\n", time_diff);
+
+    ret = xencow_init_snapshot(cow);
+    if ( ret != 0 )
+    {
+        ERROR("Error initialising snapshot");
+        return ret;
+    }
+
+    /* xencow_init_snapshot() appended the new record at the list tail. */
+    snapshot = list_bottom(&cow->snapshots, xencow_snapshot_t, list);
+    snapshot->when = domctl.u.cow_snapshot.when;
+
+    cow->next_snapshot++;
+
+    DPRINTF("when = %d\n", snapshot->when);
+
+    return ret;
+}
+
+/*
+ * Tear down a CoW context: disable CoW, close both Xen handles, clear the
+ * bookkeeping fields and release the structure through xencow_free().
+ */
+void xencow_cleanup(xencow_t *cow)
+{
+    /* Turn CoW off first; the domctl result is not checked. */
+    xencow_disable(cow);
+
+    /* Close the event channel handle, then the control interface. */
+    xc_evtchn_close(cow->xce_handle);
+    xc_interface_close(cow->xc_handle);
+    cow->xce_handle = -1;
+    cow->xc_handle = -1;
+
+    /* Reset the bookkeeping fields before handing @cow to xencow_free(). */
+    cow->sring_mfn = 0;
+    cow->num_mfns = 0;
+    cow->p2m_size = 0;
+
+    xencow_free(cow);
+}
+
+/*
+ * Take a snapshot while holding the snapshots lock.
+ * Returns 0 on success, otherwise the result of xencow_take_snapshot().
+ */
+int xencow_snapshot(xencow_t *cow)
+{
+    int rc;
+
+    cow_snapshots_lock(cow);
+    rc = xencow_take_snapshot(cow);
+    cow_snapshots_unlock(cow);
+
+    if ( rc == 0 )
+        return 0;
+
+    ERROR("Error taking snapshot");
+    return rc;
+}
+
+/*
+ * Poll the event channel handle for up to @ms (passed straight to poll()).
+ *
+ * Returns:
+ *   the pending port, after unmasking it, when poll() reports one ready fd;
+ *   -1 when poll() returns any other non-error count;
+ *    0 when poll() was interrupted (EINTR);
+ *   -2 on any other poll(), pending or unmask failure.
+ */
+int xencow_wait_for_event_or_timeout(xencow_t *cow, unsigned long ms)
+{
+    struct pollfd pfd = { .fd = cow->xce_handle, .events = POLLIN | POLLERR };
+    int nready;
+    int port;
+
+    nready = poll(&pfd, 1, ms);
+
+    if ( nready == -1 )
+    {
+        if ( errno != EINTR )
+        {
+            ERROR("Poll exited with an error");
+            return -2;
+        }
+        return 0;
+    }
+
+    /* Nothing ready: report "no port". */
+    if ( nready != 1 )
+        return -1;
+
+    port = xc_evtchn_pending(cow->xce_handle);
+    if ( port == -1 )
+    {
+        ERROR("Failed to read port from event channel");
+        return -2;
+    }
+
+    if ( xc_evtchn_unmask(cow->xce_handle, port) == -1 )
+    {
+        ERROR("Failed to unmask event channel port");
+        return -2;
+    }
+
+    return port;
+}
+
+/*
+ * Wait for an event, forwarding -1 as the timeout argument of
+ * xencow_wait_for_event_or_timeout().
+ */
+int xencow_wait_for_event(xencow_t *cow)
+{
+    unsigned long no_timeout = -1;
+
+    return xencow_wait_for_event_or_timeout(cow, no_timeout);
+}
+
+/*
+ * Query Xen for the count_info/type_info of the page backing @pfn.
+ *
+ * @pfn is translated with xencow_p2m() and the resulting MFN is passed to
+ * XEN_DOMCTL_cow_page_type.
+ *
+ * Returns the do_domctl() result.  *count_info and *type_info are written
+ * only when the call succeeds (returns 0); on failure they are untouched.
+ */
+int xencow_page_type(xencow_t *cow, unsigned long pfn,
+                     unsigned long *count_info, unsigned long *type_info)
+{
+    unsigned long mfn;
+    int ret;
+    DECLARE_DOMCTL;
+
+    mfn = xencow_p2m(cow, pfn);
+
+    domctl.cmd = XEN_DOMCTL_cow_page_type;
+    domctl.domain = cow->domain_id;
+    domctl.u.cow_page_type.mfn = mfn;
+
+    ret = do_domctl(cow->xc_handle, &domctl);
+
+    /* The test was inverted: results are only meaningful on success. */
+    if ( ret == 0 )
+    {
+        *count_info = domctl.u.cow_page_type.count_info;
+        *type_info = domctl.u.cow_page_type.type_info;
+    }
+
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/lib/xencow.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/xencow.h Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,284 @@
+/******************************************************************************
+ * tools/xencow/lib/xencow.h
+ *
+ * VM memory Copy-on-Write library.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#ifndef __XEN_COW_H__
+#define __XEN_COW_H__
+
+
+#include <inttypes.h>
+#include <xen/xen.h>
+#include <xen/io/cow.h>
+#include <xen/event_channel.h>
+#include <xen/domctl.h>
+#include <xenctrl.h>
+#include <xc_private.h>
+#include "xencow_list.h"
+
+
+/* Sentinel frame number: all bits set. */
+#define STATE_MFN ((unsigned long)(-1))
+
+/* True when _pfn is the STATE_MFN sentinel. */
+#define state_pfn(_pfn) ((_pfn) == STATE_MFN)
+
+/*
+ * Size in bytes of ((PAGE_SIZE / 2) / sizeof(unsigned long)) +
+ * XEN_COW_RING_PAGES pages.
+ */
+#define BUFFER_SIZE \
+ ((((PAGE_SIZE >> 1) / sizeof(unsigned long)) + XEN_COW_RING_PAGES) \
+ << PAGE_SHIFT)
+
+
+/* Byte offset -> page frame number. */
+#define offset_pfn(_offset) ((_offset) >> PAGE_SHIFT)
+
+/* Page frame number -> byte offset; widened to off_t before shifting. */
+#define pfn_offset(_pfn) (((off_t)(_pfn)) << PAGE_SHIFT)
+
+
+/* Number of bits in an unsigned long. */
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+
+/* XXX: stolen from xen/asm/bitops.h */
+/* XXX: should these be in here? are they required to work externally? */
+/*
+ * Always emit the x86 lock prefix.  The hypervisor original keyed this on
+ * CONFIG_SMP, which a userspace build does not normally define; with an
+ * empty prefix clear_bit() and test_and_set_bit() below would silently
+ * stop being atomic despite what their comments promise.
+ */
+#define LOCK_PREFIX "lock ; "
+
+/*
+ * Memory operands for the inline asm below.  Both expand to a dereference
+ * of a local variable or parameter named `addr`, so they are only usable
+ * inside functions that have one.
+ */
+#define ADDR (*(volatile long *) addr)
+#define CONST_ADDR (*(const volatile long *) addr)
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * Implemented with a single "btrl" instruction.  It is atomic with respect
+ * to other processors only when LOCK_PREFIX expands to the lock prefix.
+ * The asm carries a "memory" clobber, so the compiler will not move memory
+ * accesses across it.
+ */
+static inline void clear_bit(int nr, volatile void *addr)
+{
+ asm volatile (
+ LOCK_PREFIX
+ "btrl %1,%0"
+ : "=m" (ADDR)
+ : "Ir" (nr), "m" (ADDR) : "memory");
+}
+
+/**
+ * test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * Implemented with "btsl" followed by "sbbl %0,%0", so the return value is
+ * 0 when the bit was clear and -1 (all bits set) when it was already set;
+ * callers should test for non-zero rather than for 1.
+ *
+ * The operation is atomic with respect to other processors only when
+ * LOCK_PREFIX expands to the lock prefix.  The "memory" clobber keeps the
+ * compiler from reordering memory accesses across it.
+ */
+static inline int test_and_set_bit(int nr, volatile void *addr)
+{
+ int oldbit;
+
+ asm volatile (
+ LOCK_PREFIX
+ "btsl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "=m" (ADDR)
+ : "Ir" (nr), "m" (ADDR) : "memory");
+ return oldbit;
+}
+
+/**
+ * test_bit - Read a bit without modifying it
+ * @nr: Bit to test
+ * @addr: Address to count from
+ *
+ * Uses "btl" followed by "sbbl %0,%0": returns 0 when the bit is clear and
+ * -1 (all bits set) when it is set.  No lock prefix is used.
+ */
+static inline int test_bit(int nr, const volatile void *addr)
+{
+ int oldbit;
+
+ asm volatile (
+ "btl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit)
+ : "m" (CONST_ADDR), "Ir" (nr) : "memory" );
+ return oldbit;
+}
+
+/*
+ * Atomically: if *p == 0, store 1 into *p.
+ *
+ * Returns the value *p held before the operation, i.e. 0 when the caller
+ * changed it from 0 to 1 and the current non-zero value otherwise.
+ *
+ * The "memory" clobber makes this a compiler barrier, so accesses inside a
+ * critical section guarded by this primitive cannot be hoisted above it.
+ * The temporary is an int so that it matches the 32-bit cmpxchgl operand.
+ */
+static inline int testandset (int *p)
+{
+    int readval = 0;
+
+    __asm__ __volatile__ ("lock; cmpxchgl %2, %0"
+                          : "+m" (*p), "+a" (readval)
+                          : "r" (1)
+                          : "cc", "memory");
+    return readval;
+}
+
+
+/*
+ * Spin lock: a plain int, 0 when free and 1 when held, acquired with
+ * testandset().
+ */
+typedef int spinlock_t;
+
+#define SPIN_LOCK_UNLOCKED 0
+
+/* Busy-wait until the lock is acquired. */
+static inline void spin_lock(spinlock_t *lock)
+{
+    while ( testandset(lock) );
+}
+
+/* Put the lock into the unlocked state. */
+static inline void spin_lock_init(spinlock_t *lock)
+{
+    *lock = SPIN_LOCK_UNLOCKED;
+}
+
+/* Release the lock. */
+static inline void spin_unlock(spinlock_t *lock)
+{
+    /*
+     * Compiler barrier: without it the compiler may sink stores from the
+     * critical section below the releasing store.
+     */
+    __asm__ __volatile__ ("" : : : "memory");
+    *(volatile spinlock_t *)lock = SPIN_LOCK_UNLOCKED;
+}
+
+/* Try once to take the lock; returns non-zero when it was acquired. */
+static inline int spin_trylock(spinlock_t *lock)
+{
+    return !testandset(lock);
+}
+
+/* CoW ring lock */
+#define cow_ring_lock_init(_c) spin_lock_init(&(_c)->ring_lock)
+#define cow_ring_lock(_c) spin_lock(&(_c)->ring_lock)
+#define cow_ring_unlock(_c) spin_unlock(&(_c)->ring_lock)
+
+/* CoW snapshots list */
+#define cow_snapshots_lock_init(_c) spin_lock_init(&(_c)->snapshots_lock)
+#define cow_snapshots_lock(_c) spin_lock(&(_c)->snapshots_lock)
+#define cow_snapshots_unlock(_c) spin_unlock(&(_c)->snapshots_lock)
+
+
+/* One snapshot record; linked into xencow_t.snapshots through `list`. */
+typedef struct xencow_snapshot_st {
+ struct list_head list;
+
+ /* bitmap of PFNs that have been saved */
+ unsigned long *bitmap;
+
+ /* when the snapshot was taken (copied from domctl.u.cow_snapshot.when) */
+ RING_IDX when;
+
+ /* files for snapshot image; heap-allocated name strings */
+ char *xencowfs_file;
+ char *state_file;
+ char *backing_file;
+} xencow_snapshot_t;
+
+/*
+ * Platform parameters kept alongside a CoW context.  The field names match
+ * the max_mfn / hvirt_start / pt_levels / guest_width parameters of
+ * xc_canonicalise_pagetable() declared below; guest_width is also passed to
+ * xc_get_live_p2m_table().
+ */
+typedef struct platform_info_st {
+ unsigned long max_mfn;
+ unsigned long hvirt_start;
+ unsigned int pt_levels;
+ unsigned int guest_width;
+} platform_info_t;
+
+/* Per-domain CoW context used by every xencow_* call. */
+typedef struct xencow_st {
+ /* target domain of every domctl issued by the library */
+ domid_t domain_id;
+ /* non-zero: xencow_p2m() returns PFNs unchanged */
+ int is_hvm;
+ /* size passed to xc_get_live_p2m_table() and to the bitmap allocator */
+ unsigned long p2m_size;
+ /* live P2M table, mapped on first use by xencow_p2m() */
+ unsigned long *live_p2m_table;
+
+ /* handle used for do_domctl(); closed with xc_interface_close() */
+ int xc_handle;
+ /* event channel handle; polled and closed with xc_evtchn_close() */
+ int xce_handle;
+
+ /* NOTE(review): presumably the two bound event channel ports -- confirm */
+ evtchn_port_t buffer_port;
+ evtchn_port_t pause_port;
+
+ platform_info_t platform_info;
+
+ size_t buffer_size;
+ void *buffer;
+
+ /* MFNs of the buffer pages; indexed by ring slot when refilling the ring */
+ int num_mfns;
+ unsigned long *mfns;
+
+ /* MFN of the shared ring, handed to Xen by xencow_enable() */
+ unsigned long sring_mfn;
+ cow_front_ring_t front_ring;
+
+ /* taken via cow_ring_lock()/cow_ring_unlock() */
+ spinlock_t ring_lock;
+
+ void *page_buffer;
+
+ /* list of xencow_snapshot_t; next_snapshot numbers the snapshot files */
+ struct list_head snapshots;
+ unsigned int next_snapshot;
+
+ /* taken via cow_snapshots_lock()/cow_snapshots_unlock() */
+ spinlock_t snapshots_lock;
+} xencow_t;
+
+
+/*
+ * Helpers declared here for the library; their definitions are not part of
+ * this header.
+ */
+xen_pfn_t *xc_map_m2p(int xc_handle, unsigned long max_mfn, int prot,
+                      unsigned long *m2p_mfn0);
+
+int xc_canonicalise_pagetable(unsigned long type, unsigned long pfn,
+                              const void *spage, void *dpage,
+                              xen_pfn_t *live_p2m_table,
+                              xen_pfn_t *live_m2p_table,
+                              unsigned long m2p_mfn0,
+                              unsigned long p2m_size, unsigned long max_mfn,
+                              unsigned long hvirt_start,
+                              unsigned int pt_levels,
+                              unsigned int guest_width);
+
+xen_pfn_t *xc_get_live_p2m_table(int xc_handle, domid_t domain_id,
+                                 unsigned long p2m_size,
+                                 unsigned int guest_width);
+
+
+/* Initialise CoW for a domain */
+xencow_t *xencow_init(domid_t domid);
+
+/* Enable CoW */
+int xencow_enable(xencow_t *cow);
+
+/* Disable CoW */
+int xencow_disable(xencow_t *cow);
+
+/* Take a snapshot */
+int xencow_snapshot(xencow_t *cow);
+
+/* Resume a domain paused because of CoW */
+int xencow_resume(xencow_t *cow);
+
+/* Get the MFN for a PFN */
+unsigned long xencow_p2m(xencow_t *cow, unsigned long pfn);
+
+/* Cleanup a CoW struct */
+void xencow_cleanup(xencow_t *cow);
+
+/*
+ * Wait for an event.
+ *
+ * Both return the pending event channel port, 0 if the wait was interrupted
+ * (EINTR), -1 if no port became ready, and -2 on error.
+ */
+int xencow_wait_for_event(xencow_t *cow);
+int xencow_wait_for_event_or_timeout(xencow_t *cow, unsigned long ms);
+
+/* Flush the pre-dirtied page buffer */
+void xencow_flush_buffer(xencow_t *cow);
+
+/* Read pages from the pre-dirtied buffer */
+int xencow_read_buffer(xencow_t *cow, int snapshot_num,
+                       unsigned long start_pfn,
+                       int num_pages, void *buffer);
+
+/* Get info for a page */
+int xencow_page_type(xencow_t *cow, unsigned long pfn,
+                     unsigned long *count_info, unsigned long *type_info);
+
+
+#endif /* __XEN_COW_H__ */
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/lib/xencow_list.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/lib/xencow_list.h Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,555 @@
+/******************************************************************************
+ * tools/xencow/lib/xencow_list.h
+ *
+ * Linked list.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#ifndef __XEN_COW_LIST_H__
+#define __XEN_COW_LIST_H__
+
+
+/* Taken from Linux kernel code, but de-kernelized for userspace. */
+#include <stddef.h>
+
+#undef LIST_HEAD_INIT
+#undef LIST_HEAD
+#undef INIT_LIST_HEAD
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *) 0x00100100)
+#define LIST_POISON2 ((void *) 0x00200200)
+
+/*
+ * container_of - recover a pointer to the enclosing structure
+ * @ptr: pointer to the member
+ * @type: type of the enclosing structure
+ * @member: name of the member inside @type
+ *
+ * Subtracts offsetof(type, member) from @ptr.  The typeof() temporary makes
+ * the compiler check that @ptr really points at the member's type.  Uses
+ * GCC statement expressions and typeof.
+ */
+#define container_of(ptr, type, member) ({ \
+ typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+/* Link node; embed one in every structure that is placed on a list. */
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+/* Static initialiser for an empty list: both links point at the head. */
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+/* Define and initialise an empty list head called `name`. */
+#define LIST_HEAD(name) \
+ struct list_head name = LIST_HEAD_INIT(name)
+
+/* Run-time initialisation of an empty list head; evaluates ptr four times. */
+#define INIT_LIST_HEAD(ptr) do { \
+ (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/*
+ * list_top - first entry of a list, or NULL when the list is empty
+ * @head: the list head
+ * @type: type of the structure the list_head is embedded in
+ * @member: name of the list_head within that structure
+ */
+#define list_top(head, type, member)                                  \
+({                                                                    \
+    struct list_head *_head = (head);                                 \
+    list_empty(_head) ? NULL : list_entry(_head->next, type, member); \
+})
+
+/*
+ * list_bottom - last entry of a list, or NULL when the list is empty
+ * @head: the list head
+ * @type: type of the structure the list_head is embedded in
+ * @member: name of the list_head within that structure
+ */
+#define list_bottom(head, type, member) \
+({ \
+ struct list_head *_head = (head); \
+ list_empty(_head) ? NULL : list_entry(_head->prev, type, member); \
+})
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+/* Link @new between @prev and @next by rewriting the four affected links. */
+static inline void __list_add(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+    /* The new entry goes between the head and the current first entry. */
+    struct list_head *first = head->next;
+
+    __list_add(new, head, first);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+    /* The new entry goes between the current last entry and the head. */
+    struct list_head *last = head->prev;
+
+    __list_add(new, last, head);
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+/*
+ * Same linking as __list_add(), but the new entry's own links are written
+ * before it is made reachable from its neighbours.  This userspace copy
+ * contains no memory barriers.
+ */
+static __inline__ void __list_add_rcu(struct list_head * new,
+ struct list_head * prev,
+ struct list_head * next)
+{
+ new->next = next;
+ new->prev = prev;
+ next->prev = new;
+ prev->next = new;
+}
+
+/**
+ * list_add_rcu - add a new entry to rcu-protected list
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static __inline__ void list_add_rcu(struct list_head *new,
+                                    struct list_head *head)
+{
+    __list_add_rcu(new, head, head->next);
+}
+
+/**
+ * list_add_tail_rcu - add a new entry to rcu-protected list
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static __inline__ void list_add_tail_rcu(struct list_head *new,
+                                         struct list_head *head)
+{
+    __list_add_rcu(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+/* Make @prev and @next neighbours; whatever sat between them is unlinked. */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+    struct list_head *before = entry->prev;
+    struct list_head *after = entry->next;
+
+    __list_del(before, after);
+
+    /* Poison both links so stale use of the entry is likely to fault. */
+    entry->prev = LIST_POISON2;
+    entry->next = LIST_POISON1;
+}
+
+/**
+ * list_del_rcu - deletes entry from list without re-initialization
+ * @entry: the element to delete from the list.
+ *
+ * Note: list_empty on entry does not return true after this,
+ * the entry is in an undefined state. It is useful for RCU based
+ * lockfree traversal.
+ *
+ * In particular, it means that we can not poison the forward
+ * pointers that may still be used for walking the list.
+ */
+static inline void list_del_rcu(struct list_head *entry)
+{
+    struct list_head *before = entry->prev;
+    struct list_head *after = entry->next;
+
+    __list_del(before, after);
+
+    /* Only the backward link is poisoned; entry->next stays walkable. */
+    entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+    struct list_head *before = entry->prev;
+    struct list_head *after = entry->next;
+
+    __list_del(before, after);
+
+    /* Leave the entry as an empty, self-linked list. */
+    entry->next = entry;
+    entry->prev = entry;
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+    struct list_head *before = list->prev;
+    struct list_head *after = list->next;
+
+    /* Unlink from the current list, then insert right after @head. */
+    __list_del(before, after);
+    list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+                                  struct list_head *head)
+{
+    struct list_head *before = list->prev;
+    struct list_head *after = list->next;
+
+    /* Unlink from the current list, then insert right before @head. */
+    __list_del(before, after);
+    list_add_tail(list, head);
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(struct list_head *head)
+{
+    /* An empty list's head links back to itself. */
+    return head == head->next;
+}
+
+/*
+ * Insert every entry of @list between @head and @head's current first
+ * entry.  @list itself is not modified and must not be empty: for an empty
+ * list `first` and `last` would both be @list, and it would get linked in.
+ */
+static inline void __list_splice(struct list_head *list,
+ struct list_head *head)
+{
+ struct list_head *first = list->next;
+ struct list_head *last = list->prev;
+ struct list_head *at = head->next;
+
+ first->prev = head;
+ head->next = first;
+
+ last->next = at;
+ at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+    /* Nothing to join when the source list is empty. */
+    if (list_empty(list))
+        return;
+
+    __list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+                                    struct list_head *head)
+{
+    /* Nothing to join when the source list is empty. */
+    if (list_empty(list))
+        return;
+
+    __list_splice(list, head);
+
+    /* The source head is now stale; make it an empty list again. */
+    INIT_LIST_HEAD(list);
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr: the &struct list_head pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+/**
+ * list_for_each - iterate over a list
+ * @pos: the &struct list_head to use as a loop counter.
+ * @head: the head for your list.
+ */
+#define list_for_each(pos, head) \
+ for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev - iterate over a list backwards
+ * @pos: the &struct list_head to use as a loop counter.
+ * @head: the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+ for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos: the &struct list_head to use as a loop counter.
+ * @n: another &struct list_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+ for (pos = (head)->next, n = pos->next; pos != (head); \
+ pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry - iterate over list of given type
+ * @pos: the type * to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos: the type * to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member) \
+ for (pos = list_entry((head)->prev, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+
+/**
+ * list_for_each_entry_continue - iterate over list of given type
+ * continuing after existing point
+ * @pos: the type * to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_continue(pos, head, member) \
+ for (pos = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against
+ *                            removal of list entry
+ * @pos: the type * to use as a loop counter.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member), \
+ n = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
+ * too wasteful.
+ * You lose the ability to access the tail in O(1).
+ */
+
+/* Head of a hash list: a single pointer to the first node, NULL if empty. */
+struct hlist_head {
+ struct hlist_node *first;
+};
+
+/*
+ * Hash list node.  `pprev` holds the address of the pointer that points at
+ * this node: the head's `first` field or the previous node's `next` field.
+ */
+struct hlist_node {
+ struct hlist_node *next, **pprev;
+};
+
+/* Initialisers: an empty head and a node that is not on any list. */
+#define HLIST_HEAD_INIT { .first = NULL }
+#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL }
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
+#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL)
+
+/* Non-zero when @h has a NULL pprev, i.e. is not linked on a list. */
+static __inline__ int hlist_unhashed(struct hlist_node *h)
+{
+    return h->pprev == NULL;
+}
+
+/* Non-zero when the hash list has no first node. */
+static __inline__ int hlist_empty(struct hlist_head *h)
+{
+    return h->first == NULL;
+}
+
+/*
+ * Unlink @n: the pointer that referenced @n now references its successor,
+ * and the successor (if any) inherits @n's pprev.  @n's own fields are left
+ * unchanged.
+ */
+static __inline__ void __hlist_del(struct hlist_node *n)
+{
+ struct hlist_node *next = n->next;
+ struct hlist_node **pprev = n->pprev;
+ *pprev = next;
+ if (next)
+ next->pprev = pprev;
+}
+
+/* Unlink @n and poison both of its links. */
+static __inline__ void hlist_del(struct hlist_node *n)
+{
+    __hlist_del(n);
+
+    n->pprev = LIST_POISON2;
+    n->next = LIST_POISON1;
+}
+
+/**
+ * hlist_del_rcu - deletes entry from hash list without re-initialization
+ * @entry: the element to delete from the hash list.
+ *
+ * Note: list_unhashed() on entry does not return true after this,
+ * the entry is in an undefined state. It is useful for RCU based
+ * lockfree traversal.
+ *
+ * In particular, it means that we can not poison the forward
+ * pointers that may still be used for walking the hash list.
+ */
+static inline void hlist_del_rcu(struct hlist_node *n)
+{
+    __hlist_del(n);
+
+    /* Only pprev is poisoned; n->next stays usable for walkers. */
+    n->pprev = LIST_POISON2;
+}
+
+/* Unlink @n if it is on a list, then reset it to the unlinked state. */
+static __inline__ void hlist_del_init(struct hlist_node *n)
+{
+    /* A NULL pprev means the node is not linked: nothing to do. */
+    if (!n->pprev)
+        return;
+
+    __hlist_del(n);
+    INIT_HLIST_NODE(n);
+}
+
+#define hlist_del_rcu_init hlist_del_init
+
+/* Insert @n as the first node of the hash list @h. */
+static __inline__ void hlist_add_head(struct hlist_node *n,
+                                      struct hlist_head *h)
+{
+    struct hlist_node *first = h->first;
+    n->next = first;
+    if (first)
+        first->pprev = &n->next;
+    h->first = n;
+    n->pprev = &h->first;
+}
+
+/*
+ * Insert @n as the first node of @h, filling in both of @n's links before
+ * the head is made to point at it.  This userspace copy contains no memory
+ * barriers.
+ */
+static __inline__ void hlist_add_head_rcu(struct hlist_node *n,
+                                          struct hlist_head *h)
+{
+    struct hlist_node *first = h->first;
+    n->next = first;
+    n->pprev = &h->first;
+    if (first)
+        first->pprev = &n->next;
+    h->first = n;
+}
+
+/* Insert @n immediately before @next.  next must be != NULL */
+static __inline__ void hlist_add_before(struct hlist_node *n,
+                                        struct hlist_node *next)
+{
+    n->pprev = next->pprev;
+    n->next = next;
+    next->pprev = &n->next;
+    *(n->pprev) = n;
+}
+
+/*
+ * Insert @next immediately after @n, which must already be on a list.
+ *
+ * The previous body stored through next->pprev, a field of the node being
+ * inserted that has not been set yet, and never pointed next->pprev at
+ * n->next, so the new node could not be unlinked afterwards.
+ */
+static __inline__ void hlist_add_after(struct hlist_node *n,
+                                       struct hlist_node *next)
+{
+    next->next = n->next;
+    n->next = next;
+    next->pprev = &n->next;
+
+    /* Fix the back-reference of the node that now follows @next. */
+    if (next->next)
+        next->next->pprev = &next->next;
+}
+
+/* Enclosing structure of an hlist_node; see container_of(). */
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+/* Cannot easily do prefetch unfortunately */
+/* Walk every node of @head; @pos is a struct hlist_node pointer. */
+#define hlist_for_each(pos, head) \
+ for (pos = (head)->first; pos; pos = pos->next)
+
+/* As hlist_for_each(), but the successor is saved in @n before the body
+ * runs, so the body may unlink @pos. */
+#define hlist_for_each_safe(pos, n, head) \
+ for (pos = (head)->first; n = pos ? pos->next : 0, pos; \
+ pos = n)
+
+/**
+ * hlist_for_each_entry - iterate over list of given type
+ * @tpos: the type * to use as a loop counter.
+ * @pos: the &struct hlist_node to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(tpos, pos, head, member) \
+ for (pos = (head)->first; \
+ pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+/**
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after
+ *                                 existing point
+ * @tpos: the type * to use as a loop counter.
+ * @pos: the &struct hlist_node to use as a loop counter.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue(tpos, pos, member) \
+ for (pos = (pos)->next; \
+ pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+/**
+ * hlist_for_each_entry_from - iterate over a hlist continuing from existing
+ *                             point
+ * @tpos: the type * to use as a loop counter.
+ * @pos: the &struct hlist_node to use as a loop counter.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from(tpos, pos, member) \
+ for (; pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against
+ *                             removal of list entry
+ * @tpos: the type * to use as a loop counter.
+ * @pos: the &struct hlist_node to use as a loop counter.
+ * @n: another &struct hlist_node to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \
+ for (pos = (head)->first; \
+ pos && ({ n = pos->next; 1; }) && \
+ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = n)
+
+
+#endif /* __XEN_COW_LIST_H__ */
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/test/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/test/Makefile Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,32 @@
+XEN_ROOT=../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS += -I $(XEN_XC)
+CFLAGS += $(CFLAGS_libxenctrl)
+
+SRCS += cow_compare.c
+
+CFLAGS += -Werror
+CFLAGS += -g
+CFLAGS += -Wl,-rpath,..
+
+LDFLAGS += $(LDFLAGS_libxenctrl) -lxencow
+
+OBJS = $(SRCS:.c=.o)
+IBINS = cow_compare
+
+all: $(IBINS)
+
+cow_compare: $(OBJS)
+ $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
+
+install: all
+ $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
+ $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(SBINDIR)
+
+clean:
+ rm -f *.o *~ $(DEPS) xen TAGS $(IBINS) $(LIB)
+
+.PHONY: all clean install
+
+-include $(DEPS)
diff -r 0477f9061c8a tools/xencow/test/cow_compare.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/test/cow_compare.c Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,896 @@
+/******************************************************************************
+ * tools/xencow/test/cow_compare.c
+ *
+ * Test application to compare CoW image and live memory dumps
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <string.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#if 0
+#include <xen/xen.h>
+#include <xc_private.h>
+#endif
+#include <xg_private.h>
+#include <xg_save_restore.h>
+#include <xenctrl.h>
+
+#include "../lib/xc.h"
+#include "../lib/xencow.h"
+
+
+/* Set to 0 to build without the CoW dump/compare steps (live dumps only). */
+#define COW 1
+
+/* Output file names, created relative to the current working directory. */
+#if COW
+#define COW_FILE1 "vm_dump.cow1"
+#define COW_FILE2 "vm_dump.cow2"
+#endif
+#define LIVE_FILE1 "vm_dump.live1"
+#define LIVE_FILE2 "vm_dump.live2"
+
+
+#define SLEEP_TIME (10 * 1) /* seconds passed to sleep(); currently 10 s */
+
+
+/* Printing functions */
+#if 1
+/* Debug message on stdout, prefixed with the calling function's name. */
+#define pr_debug(_f, _a...) \
+ printf("%s(): " _f, __func__, ##_a)
+#else
+#define pr_debug(_f, _a...) ((void)0)
+#endif
+
+/* Warning on stderr, prefixed with the calling function's name. */
+#define warning(_f, _a...) \
+ fprintf(stderr, "%s(): " _f, __func__, ##_a)
+
+
+/* Non-zero when "-d" was given on the command line; enables extra output. */
+static int debug_mode = 0;
+
+/*
+ * Compiled out.  Queries Xen via XEN_DOMCTL_cow_page_type for the
+ * count_info / type_info words of a machine frame.  On a failed domctl it
+ * prints a message and returns both fields as 0.
+ * NOTE(review): DECLARE_DOMCTL / do_domctl presumably come from
+ * xc_private.h, whose include is also under "#if 0" in this file -- confirm
+ * before re-enabling.
+ */
+#if 0
+typedef struct page_info_st {
+ unsigned long count_info;
+ unsigned long type_info;
+} page_info_t;
+
+page_info_t get_page_type_info(int xc_handle, domid_t domain_id, xen_pfn_t mfn)
+{
+ page_info_t page_info;
+ int rc;
+ DECLARE_DOMCTL;
+
+ domctl.cmd = XEN_DOMCTL_cow_page_type;
+ domctl.domain = domain_id;
+ domctl.u.cow_page_type.mfn = mfn;
+
+ rc = do_domctl(xc_handle, &domctl);
+ if ( rc != 0 )
+ {
+ printf("error getting page type for %lx\n", mfn);
+ page_info.count_info = 0;
+ page_info.type_info = 0;
+ return page_info;
+ }
+
+ page_info.count_info = domctl.u.cow_page_type.count_info;
+ page_info.type_info = domctl.u.cow_page_type.type_info;
+
+ return page_info;
+}
+#endif
+
+/*
+ * Byte-wise comparison of two buffers of page_size bytes.
+ *
+ * Returns 0 when the buffers are identical, otherwise minus the number of
+ * byte positions at which they differ.  When built with VERBOSE set, every
+ * differing offset is also printed together with both byte values.
+ */
+int compare_pages(void *page1, void *page2, uint32_t page_size)
+{
+    const char *lhs = page1;
+    const char *rhs = page2;
+    int differing = 0;
+    uint32_t off = 0;
+
+    while ( off < page_size )
+    {
+        if ( lhs[off] != rhs[off] )
+        {
+            differing++;
+#if VERBOSE
+            printf("images do not match at offset %x (%u): ", off, off);
+            printf("(%x) (%x)\n", lhs[off], rhs[off]);
+#endif
+        }
+        off++;
+    }
+
+    return -differing;
+}
+
+/*
+ * Compare two memory image files page by page.
+ *
+ * For every pfn in [0, p2m_size) one PAGE_SIZE page is read from each file
+ * at pfn_offset(pfn) and handed to compare_pages().  For a differing page
+ * the frame number is looked up (the pfn itself when live_p2m_table is
+ * NULL, which this function treats as "HVM guest"; pfn_to_mfn(pfn)
+ * otherwise) and, if is_mapped(), a line describing the page type and the
+ * number of differing bytes is printed.  For page-table pages the differing
+ * bytes outside the [xen_start, xen_end) window are counted in
+ * nonhypervisor_bytes.
+ *
+ * Returns 0 when no page was counted as different, otherwise minus the
+ * number of counted pages; a page is counted when
+ * nonhypervisor_bytes + rc != 0.  Also returns -1 when a file cannot be
+ * opened or read, which is indistinguishable from "one page differs".
+ *
+ * xc_handle, domain_id, live_m2p_table, max_mfn and guest_width are not
+ * referenced by name in this body.
+ * NOTE(review): pfn_to_mfn()/is_mapped() are defined elsewhere and
+ * presumably expand to uses of live_p2m_table (and possibly guest_width) --
+ * confirm before removing any parameter.
+ * NOTE(review): the early "return -1" paths close neither fd1/fd2 nor free
+ * page1/page2.
+ */
+int compare(char *file1, char *file2, int xc_handle, domid_t domain_id,
+ xen_pfn_t *live_p2m_table, xen_pfn_t *live_m2p_table,
+ unsigned long m2p_mfn0,
+ unsigned long p2m_size, unsigned long max_mfn,
+ unsigned long hvirt_start, unsigned int pt_levels,
+ unsigned int guest_width)
+{
+ int open_flags = O_RDONLY;
+ mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH;
+ int fd1;
+ int fd2;
+ /* NOTE(review): neither buffer is checked for NULL or freed on any path. */
+ void *page1 = malloc(PAGE_SIZE);
+ void *page2 = malloc(PAGE_SIZE);
+ unsigned long pfn;
+ int hvm = 0;
+ int rc;
+ int ret = 0;
+
+ if ( !live_p2m_table )
+ hvm = 1;
+
+ /* Open image files */
+ fd1 = open(file1, open_flags, open_mode);
+ if ( fd1 < 0 )
+ {
+ perror("failed to open file1");
+ return -1;
+ }
+
+ fd2 = open(file2, open_flags, open_mode);
+ if ( fd2 < 0 )
+ {
+ perror("failed to open file2");
+ return -1;
+ }
+
+ /* Read images */
+ pfn = 0;
+ while ( pfn < p2m_size )
+ {
+ /* Read pages */
+ off64_t offset = pfn_offset(pfn);
+ off64_t ret_seek;
+ int total_read;
+
+ /*
+ * NOTE(review): ret_seek is assigned but never checked; the disabled
+ * checks below test ret_fd1/ret_fd2, which are not declared anywhere in
+ * this function.
+ */
+ ret_seek = lseek64(fd1, offset, SEEK_SET);
+#if 0
+ if ( ret_fd1 < 0 )
+ {
+ perror("failed to seek file1");
+ return -1;
+ }
+#endif
+
+ ret_seek = lseek64(fd2, offset, SEEK_SET);
+#if 0
+ if ( ret_fd2 < 0 )
+ {
+ perror("failed to seek file2");
+ return -1;
+ }
+#endif
+
+ /* Loop until a whole page is read; a zero-byte read is treated as failure. */
+ total_read = 0;
+ while ( total_read < PAGE_SIZE )
+ {
+ void *p = page1 + total_read;
+ int bytes_read = read(fd1, p, PAGE_SIZE - total_read);
+ if ( bytes_read <= 0 )
+ {
+ perror("failed to read from file1");
+ return -1;
+ }
+ total_read += bytes_read;
+ }
+
+ total_read = 0;
+ while ( total_read < PAGE_SIZE )
+ {
+ void *p = page2 + total_read;
+ int bytes_read = read(fd2, p, PAGE_SIZE - total_read);
+ if ( bytes_read <= 0 )
+ {
+ perror("failed to read from file2");
+ return -1;
+ }
+ total_read += bytes_read;
+ }
+
+ /* rc is 0 or minus the number of differing bytes in this page. */
+ rc = compare_pages(page1, page2, PAGE_SIZE);
+
+ /* Check if the pages are different */
+ if ( rc != 0 )
+ {
+ xen_pfn_t mfn;
+ int check_frame = 0;
+ int nonhypervisor_bytes = 0;
+
+ /* Get MFN */
+ if ( hvm )
+ mfn = pfn;
+ else
+ mfn = pfn_to_mfn(pfn);
+
+#if 1
+ if ( is_mapped(mfn) )
+ {
+#endif
+ if ( hvm )
+ {
+ if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
XEN_DOMCTL_PFINFO_XTAB )
+ check_frame = 1;
+ }
+ else
+ {
+ /*
+ * NOTE(review): dump_memory() calls xc_get_pfn_type_batch() between
+ * these two statements to obtain the type bits.  No such call is made
+ * here, so mfn is only truncated to 32 bits -- confirm whether the
+ * page-table classification below can trigger.
+ */
+ ((uint32_t *)(&mfn))[0] = mfn;
+ mfn = (uint32_t)mfn;
+
+ if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
XEN_DOMCTL_PFINFO_XTAB )
+ {
+ /* Canonicalise mfn -> pfn */
+ mfn = (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) | pfn;
+ check_frame = 1;
+ }
+ }
+
+ if ( check_frame )
+ {
+ unsigned long addr;
+ unsigned long type;
+
+ /* addr is computed but not used further in this function. */
+ addr = mfn & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+ type = mfn & XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+ /* Check if the page is present */
+ if ( type != XEN_DOMCTL_PFINFO_XTAB )
+ {
+ type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+ if ( (type >= XEN_DOMCTL_PFINFO_L1TAB) &&
+ (type <= XEN_DOMCTL_PFINFO_L4TAB) )
+ {
+ int pte_last;
+ int xen_start;
+ int xen_end;
+ int i;
+
+ printf("page table page: %lx\n", pfn);
+
+ /*
+ * We need to determine which entries in this page
table hold
+ * reserved hypervisor mappings. This depends on
the current
+ * page table type as well as the number of paging
levels.
+ */
+ xen_start = xen_end = pte_last = PAGE_SIZE /
((pt_levels == 2) ? 4 : 8);
+
+ if ( (pt_levels == 2) && (type ==
XEN_DOMCTL_PFINFO_L2TAB) )
+ xen_start = (hvirt_start >>
L2_PAGETABLE_SHIFT);
+
+ if ( (pt_levels == 3) && (type ==
XEN_DOMCTL_PFINFO_L3TAB) )
+ xen_start = L3_PAGETABLE_ENTRIES_PAE;
+
+ /*
+ * In PAE only the L2 mapping the top 1GB contains
Xen mappings.
+ * We can spot this by looking for the guest's
mapping of the m2p.
+ * Guests must ensure that this check will fail
for other L2s.
+ */
+ if ( (pt_levels == 3) && (type ==
XEN_DOMCTL_PFINFO_L2TAB) )
+ {
+ int hstart;
+ uint64_t he;
+
+ hstart = (hvirt_start >>
L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+ he = ((const uint64_t *)page1)[hstart];
+
+ if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) ==
m2p_mfn0 )
+ {
+ /* hvirt starts with xen stuff... */
+ xen_start = hstart;
+ }
+ else if ( hvirt_start != 0xf5800000 )
+ {
+ /* old L2s from before hole was shrunk...
*/
+ hstart = (0xf5800000 >>
L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+ he = ((const uint64_t *)page1)[hstart];
+ if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86)
== m2p_mfn0 )
+ xen_start = hstart;
+ }
+ }
+
+ if ( (pt_levels == 4) && (type ==
XEN_DOMCTL_PFINFO_L4TAB) )
+ {
+ /*
+ * XXX SMH: should compute these from
hvirt_start (which we have)
+ * and hvirt_end (which we don't)
+ */
+ xen_start = 256;
+ xen_end = 272;
+ }
+
+ /*
+ * Scan for changed bytes that aren't reserved by
+ * the hypervisor
+ */
+ /*
+ * NOTE(review): i indexes single bytes of the page, but pte_last is
+ * an entry count (PAGE_SIZE / 4 or 8) and xen_start/xen_end are
+ * computed above as entry indices.  Only the first pte_last bytes
+ * are scanned -- confirm whether byte offsets were intended.
+ */
+ for ( i = 0; i < pte_last; i++ )
+ if ( ((char *)page1)[i] != ((char *)page2)[i] )
+ if ( !((i >= xen_start) && (i < xen_end)) )
+ nonhypervisor_bytes++;
+ }
+
+ /* Report: pfn, differing bytes (-rc), and nonhypervisor_bytes. */
+ switch (type)
+ {
+
+ case XEN_DOMCTL_PFINFO_NOTAB:
+ {
+ printf(" normal page: %lx: %d %d\n", pfn, -rc,
nonhypervisor_bytes);
+ }
+ break;
+
+ case XEN_DOMCTL_PFINFO_L1TAB:
+ {
+ printf(" l1 table page: %lx: %d %d\n", pfn, -rc,
nonhypervisor_bytes);
+ }
+ break;
+
+ case XEN_DOMCTL_PFINFO_L2TAB:
+ {
+ printf(" l2 table page: %lx: %d %d\n", pfn, -rc,
nonhypervisor_bytes);
+ }
+ break;
+
+ case XEN_DOMCTL_PFINFO_L3TAB:
+ {
+ printf(" l3 table page: %lx: %d %d\n", pfn, -rc,
nonhypervisor_bytes);
+ }
+ break;
+
+ case XEN_DOMCTL_PFINFO_L4TAB:
+ {
+ printf(" l4 table page: %lx: %d %d\n", pfn, -rc,
nonhypervisor_bytes);
+ }
+ break;
+
+ case XEN_DOMCTL_PFINFO_LPINTAB:
+ {
+ printf(" pin page: %lx: %d %d\n", pfn, -rc,
nonhypervisor_bytes);
+ }
+ break;
+
+ case XEN_DOMCTL_PFINFO_XTAB:
+ {
+ printf(" invalid page: %lx: %d %d\n", pfn, -rc,
nonhypervisor_bytes);
+ }
+ break;
+
+ default:
+ printf(" unknown page: %lx: %d %d\n", pfn, -rc,
nonhypervisor_bytes);
+ }
+
+ }
+ }
+#if 1
+ }
+#endif
+
+ if ( debug_mode )
+ printf("images do not match at page %lx (%lx): %d (%d) bytes
different\n",
+ pfn, mfn, -rc, nonhypervisor_bytes);
+
+ /*
+ * rc is minus the differing-byte count, so this counts the page unless
+ * nonhypervisor_bytes equals the number of differing bytes.
+ */
+ if ( nonhypervisor_bytes + rc != 0)
+ ret--;
+ }
+
+ /* Move to next page */
+ pfn++;
+ }
+
+ close(fd1);
+ close(fd2);
+
+ return ret;
+}
+
+/*
+ * Write the current memory contents of a domain to a file.
+ *
+ * For every pfn in [0, p2m_size) whose frame is_mapped(), the frame is
+ * mapped read-only into this process and its PAGE_SIZE bytes are written
+ * to filename at offset pfn_offset(pfn).  A NULL live_p2m_table selects the
+ * HVM path (frame number == pfn, mapped with xc_map_foreign_batch());
+ * otherwise the frame is looked up with pfn_to_mfn() and its type is
+ * queried with xc_get_pfn_type_batch() so that frames reported as
+ * XEN_DOMCTL_PFINFO_XTAB are skipped.  Page-table pages are written as they
+ * are; no canonicalisation is performed.
+ *
+ * Frames that cannot be mapped are skipped and leave a hole in the file.
+ *
+ * live_m2p_table, m2p_mfn0, max_mfn, hvirt_start and pt_levels are accepted
+ * for signature compatibility and are not referenced by name here.
+ *
+ * Returns 0 on success and -1 when the file cannot be opened, a seek or
+ * write fails, or the page type query fails.  The mapping and the file
+ * descriptor are released on every path.
+ */
+int dump_memory(char *filename, int xc_handle, domid_t domain_id,
+                xen_pfn_t *live_p2m_table, xen_pfn_t *live_m2p_table,
+                unsigned long m2p_mfn0,
+                unsigned long p2m_size, unsigned long max_mfn,
+                unsigned long hvirt_start, unsigned int pt_levels,
+                unsigned int guest_width)
+{
+    int open_flags = O_CREAT | O_TRUNC | O_RDWR;
+    mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH |
+                       S_IWUSR | S_IWGRP | S_IWOTH;
+    int fd;
+    unsigned long pfn;
+    void *page = NULL;
+    int hvm = 0;
+    int rc;
+    int ret = -1;
+
+    if ( !live_p2m_table )
+        hvm = 1;
+
+    /* Open file */
+    fd = open(filename, open_flags, open_mode);
+    if ( fd < 0 )
+    {
+        perror("failed to open file");
+        return -1;
+    }
+
+    /* Write out memory contents */
+    for ( pfn = 0; pfn < p2m_size; pfn++ )
+    {
+        xen_pfn_t mfn;
+        int copy_frame = 0;
+        int total_written;
+
+        page = NULL;
+
+        if ( hvm )
+            mfn = pfn;
+        else
+            mfn = pfn_to_mfn(pfn);
+
+        if ( !is_mapped(mfn) )
+            continue;
+
+        /* Map the page */
+        if ( hvm )
+        {
+            if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                 XEN_DOMCTL_PFINFO_XTAB )
+            {
+                page = xc_map_foreign_batch(xc_handle, domain_id,
+                                            PROT_READ, &mfn, 1);
+                copy_frame = 1;
+            }
+        }
+        else
+        {
+            page = xc_map_foreign_range(xc_handle, domain_id, PAGE_SIZE,
+                                        PROT_READ, mfn);
+
+            /* The type query takes and returns 32-bit entries in place. */
+            ((uint32_t *)(&mfn))[0] = mfn;
+
+            rc = xc_get_pfn_type_batch(xc_handle, domain_id, 1,
+                                       (uint32_t *)(&mfn));
+            if ( rc )
+            {
+                ERROR("get_pfn_type_batch failed");
+                goto out;
+            }
+            mfn = (uint32_t)mfn;
+
+            if ( (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                 XEN_DOMCTL_PFINFO_XTAB )
+            {
+                /* Canonicalise mfn -> pfn */
+                mfn = (mfn & XEN_DOMCTL_PFINFO_LTAB_MASK) | pfn;
+                copy_frame = 1;
+            }
+        }
+
+        /* Nothing was mapped: leave a hole in the file for this pfn. */
+        if ( page == NULL )
+            continue;
+
+        if ( !copy_frame )
+        {
+            munmap(page, PAGE_SIZE);
+            page = NULL;
+            continue;
+        }
+
+        if ( lseek64(fd, pfn_offset(pfn), SEEK_SET) < 0 )
+        {
+            perror("failed to seek file");
+            goto out;
+        }
+
+        total_written = 0;
+        while ( total_written < PAGE_SIZE )
+        {
+            /*
+             * Must be signed: with an unsigned type a write() result of
+             * -1 would never compare <= 0 and would corrupt the count.
+             */
+            ssize_t bytes_written = write(fd, (char *)page + total_written,
+                                          PAGE_SIZE - total_written);
+
+            if ( bytes_written <= 0 )
+            {
+                perror("failed to write to file");
+                goto out;
+            }
+
+            total_written += bytes_written;
+        }
+
+        munmap(page, PAGE_SIZE);
+        page = NULL;
+    }
+
+    ret = 0;
+
+ out:
+    /* Only non-NULL when leaving the loop early with a live mapping. */
+    if ( page != NULL )
+        munmap(page, PAGE_SIZE);
+    close(fd);
+
+    return ret;
+}
+
+#if COW
+/*
+ * Copy a CoW snapshot image into a dump file.
+ *
+ * Reads p2m_size pages of PAGE_SIZE bytes from cow_file and writes each one
+ * to filename at the same offset, pfn_offset(pfn).  filename is created or
+ * truncated.
+ *
+ * Returns 0 on success and -1 on any allocation, open, seek, read or write
+ * failure.  A short image (read() returning 0 before a full page) counts as
+ * a read failure.  The page buffer and both descriptors are released on
+ * every path.
+ */
+int dump_cow(char *filename, unsigned long p2m_size, char *cow_file)
+{
+    int open_flags = O_CREAT | O_TRUNC | O_RDWR;
+    mode_t open_mode = S_IRUSR | S_IRGRP | S_IROTH |
+                       S_IWUSR | S_IWGRP | S_IWOTH;
+    int fd_cow = -1;
+    int fd_dump = -1;
+    unsigned long pfn;
+    char *page;
+    int ret = -1;
+
+    page = malloc(PAGE_SIZE);
+    if ( page == NULL )
+    {
+        perror("failed to allocate page buffer");
+        goto out;
+    }
+
+    /* Open files */
+    fd_cow = open(cow_file, O_RDONLY);
+    if ( fd_cow < 0 )
+    {
+        perror("failed to open cow file");
+        goto out;
+    }
+
+    fd_dump = open(filename, open_flags, open_mode);
+    if ( fd_dump < 0 )
+    {
+        perror("failed to open dump file");
+        goto out;
+    }
+
+    /* Write out memory contents */
+    for ( pfn = 0; pfn < p2m_size; pfn++ )
+    {
+        int total_read = 0;
+        int total_written = 0;
+
+        /* Read page */
+        if ( lseek64(fd_cow, pfn_offset(pfn), SEEK_SET) < 0 )
+        {
+            perror("failed to seek cow file");
+            goto out;
+        }
+
+        while ( total_read < PAGE_SIZE )
+        {
+            int bytes_read = read(fd_cow, page + total_read,
+                                  PAGE_SIZE - total_read);
+            if ( bytes_read <= 0 )
+            {
+                perror("failed to read cow file");
+                goto out;
+            }
+
+            total_read += bytes_read;
+        }
+
+        /* Write memory contents to file */
+        if ( lseek64(fd_dump, pfn_offset(pfn), SEEK_SET) < 0 )
+        {
+            perror("failed to seek dump file");
+            goto out;
+        }
+
+        while ( total_written < PAGE_SIZE )
+        {
+            int bytes_written = write(fd_dump, page + total_written,
+                                      PAGE_SIZE - total_written);
+            if ( bytes_written <= 0 )
+            {
+                perror("failed to write dump file");
+                goto out;
+            }
+
+            total_written += bytes_written;
+        }
+    }
+
+    ret = 0;
+
+ out:
+    if ( fd_dump >= 0 )
+        close(fd_dump);
+    if ( fd_cow >= 0 )
+        close(fd_cow);
+    free(page);
+
+    return ret;
+}
+#endif
+
+/*
+ * cow_compare <domid> <snapshot> [-d]
+ *
+ * Pauses the domain, triggers a snapshot by touching a file under the
+ * hard-coded FUSE mount point /tmp/foo, dumps the CoW image and two live
+ * images, unpauses the domain, waits SLEEP_TIME seconds, dumps the CoW
+ * image again and finally compares the dumps pairwise with compare().
+ *
+ * <snapshot> is the snapshot number used to build the name of the first
+ * CoW image file (/tmp/foo/<domid>.<snapshot>); "-d" enables debug output.
+ *
+ * Returns 0 when the whole sequence ran and 1 on a usage error or when a
+ * setup step (Xen connection, platform/domain info, table mapping,
+ * pause/unpause) failed.  Differences between images are reported on
+ * stdout and do not affect the exit status.
+ */
+int main(int argc, char *argv[])
+{
+    /* Image pairs to compare, in reporting order. */
+    static const struct {
+        const char *banner;
+        char *file1;
+        char *file2;
+    } comparisons[] = {
+        { "Comparing live1 live2...\n", LIVE_FILE1, LIVE_FILE2 },
+#if COW
+        { "Comparing cow1 cow2...\n",   COW_FILE1,  COW_FILE2 },
+        { "Comparing live1 cow1...\n",  LIVE_FILE1, COW_FILE1 },
+        { "Comparing live1 cow2...\n",  LIVE_FILE1, COW_FILE2 },
+        { "Comparing live2 cow1...\n",  LIVE_FILE2, COW_FILE1 },
+        { "Comparing live2 cow2...\n",  LIVE_FILE2, COW_FILE2 },
+#endif
+    };
+    xc_dominfo_t info;
+    domid_t domain_id;
+    int snapshot_num;
+    int xc_handle;
+    xen_pfn_t *live_m2p_table;
+    xen_pfn_t *live_p2m_table;
+    unsigned long m2p_mfn0;
+    unsigned long p2m_size;
+    unsigned long max_mfn;
+    unsigned long hvirt_start;
+    unsigned int pt_levels;
+    unsigned int guest_width;
+#if COW
+    char fuse_file[200];
+#endif
+    unsigned int i;
+    int rc;
+    int ret = 1;
+
+    /* Both positional arguments are required before argv is touched. */
+    if ( argc < 3 )
+    {
+        fprintf(stderr, "usage: %s <domid> <snapshot> [-d]\n",
+                (argc > 0) ? argv[0] : "cow_compare");
+        goto out;
+    }
+
+    domain_id = atoi(argv[1]);
+    /* TODO: find this automatically */
+    snapshot_num = atoi(argv[2]);
+
+    if ( argc > 3 )
+    {
+        if ( strcmp(argv[3], "-d") == 0 )
+            debug_mode = 1;
+    }
+
+    /* Open connection to Xen */
+    rc = xc_interface_open();
+    if ( rc < 0 )
+    {
+        warning("failed to connect to Xen\n");
+        goto out;
+    }
+    xc_handle = rc;
+
+    /* Get some info */
+    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &domain_id) + 1;
+
+    rc = get_platform_info(xc_handle, domain_id, &max_mfn, &hvirt_start,
+                           &pt_levels, &guest_width);
+    if ( rc != 1 )
+    {
+        warning("failed to get platform info\n");
+        goto out;
+    }
+
+    /* Get HVM info */
+    rc = xc_domain_getinfo(xc_handle, domain_id, 1, &info);
+    if ( rc != 1 )
+    {
+        warning("failed to get domain info\n");
+        goto out;
+    }
+
+    /* Print info */
+    if ( debug_mode )
+    {
+        printf("p2m_size: %lu\n", p2m_size);
+        printf("max_mfn: %lx (%lu)\n", max_mfn, max_mfn);
+        printf("hvirt_start: %lx (%lu)\n", hvirt_start, hvirt_start);
+        printf("pt_levels: %x (%u)\n", pt_levels, pt_levels);
+        printf("guest_width: %x (%u)\n", guest_width, guest_width);
+        printf("shared_info_frame: %lx (%lu)\n", info.shared_info_frame,
+               info.shared_info_frame);
+    }
+
+    /* Setup the pfn to mfn table mapping */
+    if ( info.hvm )
+    {
+        if ( debug_mode )
+            printf("HVM guest\n");
+
+        /* A NULL p2m table tells dump_memory()/compare() this is HVM. */
+        live_p2m_table = NULL;
+    }
+    else
+    {
+        if ( debug_mode )
+            printf("PV guest\n");
+
+        /* Get live p2m table */
+        live_p2m_table = xc_get_live_p2m_table(xc_handle, domain_id, p2m_size,
+                                               guest_width);
+        if ( !live_p2m_table )
+        {
+            warning("failed to get live p2m table\n");
+            goto out;
+        }
+    }
+
+    /* Setup the mfn to pfn table mapping */
+    live_m2p_table = xc_map_m2p(xc_handle, max_mfn, PROT_READ, &m2p_mfn0);
+    if ( !live_m2p_table )
+    {
+        warning("failed to map live m2p table\n");
+        goto out;
+    }
+
+    /* Pause domain */
+    printf("Pausing domain\n");
+    rc = xc_domain_pause(xc_handle, domain_id);
+    if ( rc != 0 )
+    {
+        warning("failed to pause domain");
+        goto out;
+    }
+    sleep(1);
+
+#if COW
+    /* Take snapshot: creating any file under the FUSE mount triggers one */
+    printf("Taking snapshot\n");
+    rc = system("touch /tmp/foo/1");
+    if ( rc != 0 )
+        warning("snapshot trigger command failed: rc = %d\n", rc);
+    sleep(1);
+
+    /* Dump CoW image */
+    printf("Dumping CoW image (1)... ");
+    fflush(stdout);
+    snprintf(fuse_file, sizeof(fuse_file), "/tmp/foo/%d.%d",
+             domain_id, snapshot_num);
+    dump_cow(COW_FILE1, p2m_size, fuse_file);
+    printf("done\n");
+#endif
+
+    /* Dump live VM image */
+    printf("Dumping live VM image (1)... ");
+    fflush(stdout);
+    dump_memory(LIVE_FILE1, xc_handle, domain_id, live_p2m_table,
+                live_m2p_table, m2p_mfn0, p2m_size, max_mfn, hvirt_start,
+                pt_levels, guest_width);
+    printf("done\n");
+
+    /* Dump another live VM image */
+    printf("Dumping live VM image (2)... ");
+    fflush(stdout);
+    dump_memory(LIVE_FILE2, xc_handle, domain_id, live_p2m_table,
+                live_m2p_table, m2p_mfn0, p2m_size, max_mfn, hvirt_start,
+                pt_levels, guest_width);
+    printf("done\n");
+
+    /* Unpause domain */
+    printf("Unpausing domain\n");
+    rc = xc_domain_unpause(xc_handle, domain_id);
+    if ( rc != 0 )
+    {
+        warning("failed to unpause domain");
+        goto out;
+    }
+
+#if COW
+    /* Let domain run for a bit */
+    printf("Sleeping for %d seconds\n", SLEEP_TIME);
+    sleep(SLEEP_TIME);
+
+    /* Dump CoW image */
+    printf("Dumping CoW image (2)... ");
+    fflush(stdout);
+    snprintf(fuse_file, sizeof(fuse_file), "/tmp/foo/%d.0", domain_id);
+    dump_cow(COW_FILE2, p2m_size, fuse_file);
+    printf("done\n");
+#endif
+
+    /* Compare images */
+    printf("-- Comparing images --\n");
+
+    for ( i = 0; i < sizeof(comparisons) / sizeof(comparisons[0]); i++ )
+    {
+        printf("%s", comparisons[i].banner);
+        rc = compare(comparisons[i].file1, comparisons[i].file2, xc_handle,
+                     domain_id, live_p2m_table, live_m2p_table, m2p_mfn0,
+                     p2m_size, max_mfn, hvirt_start, pt_levels, guest_width);
+        if ( rc != 0 )
+            printf("Images do not match (%d pages different)\n", -rc);
+        else
+            printf("Images match\n");
+    }
+
+    ret = 0;
+
+ out:
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a tools/xencow/xencowfs/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/xencowfs/Makefile Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,35 @@
+# Build rules for xencowfs, the FUSE front end to the CoW snapshot library.
+XEN_ROOT=../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS += -I $(XEN_XC)
+CFLAGS += -I ../lib
+CFLAGS += $(CFLAGS_libxenctrl)
+
+SRCS += xencowfs.c
+
+CFLAGS += -Werror
+# Several FUSE callbacks in xencowfs.c ignore some of their parameters.
+CFLAGS += -Wno-unused
+# Makes off_t 64 bits wide on 32-bit builds.
+CFLAGS += -D_FILE_OFFSET_BITS=64
+CFLAGS += -g
+# Linker option passed through CFLAGS; it takes effect because the link
+# rule below puts $(CFLAGS) on the command line.
+CFLAGS += -Wl,-rpath,..
+
+LDFLAGS += $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenguest) -L../lib -lxencow
-lfuse -lpthread
+
+OBJS = $(SRCS:.c=.o)
+IBINS = xencowfs
+
+all: $(IBINS)
+
+xencowfs: $(OBJS)
+ $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
+
+install: all
+ $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
+ $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(SBINDIR)
+
+clean:
+ rm -f *.o *~ $(DEPS) xen TAGS $(IBINS) $(LIB)
+
+# NOTE(review): "all" is not listed as phony.
+.PHONY: clean install
+
+-include $(DEPS)
diff -r 0477f9061c8a tools/xencow/xencowfs/README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/xencowfs/README Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,24 @@
+
+
+First, launch the desired target domain. Create a mount point for the xencow
FUSE module to use and from the tools/xencow/xencowfs directory, run:
+
+sudo ./xencowfs <mount point> <domid>
+
+This will initialise CoW for the domain. To take a snapshot, simply poke the
FUSE mount point (e.g. touch <mount point>/1). The file name doesn't matter as
FUSE will create its own file named:
+
+<mount point>/<domid>.<snapshot>
+
+e.g. xencow/1.0, xencow/1.1
+
+Currently, the backing files are hardcoded to appear in /tmp with the
following names:
+
+/tmp/xencow<domid>.<snapshot>
+/tmp/xencow<domid>.<snapshot>.state
+
+The <domid>.<snapshot> pair corresponds to the FUSE file. The .state file
contains state pages (e.g. CPU registers), while the other file contains the
pre-dirtied pages for that domain.
+
+It is possible to use XenAccess in file mode to access the snapshot image. A
slightly modified version of the memory-dump example from XenAccess 0.5 is
included which is designed to work on the CoW image. To use it, run (I've only
tried running it from the xenaccess-0.5/examples/):
+
+sudo dump-memory-cow <FUSE image file> <output file>
+
+This will create a complete memory image of the running domain at the time the
snapshot was taken.
\ No newline at end of file
diff -r 0477f9061c8a tools/xencow/xencowfs/xencowfs.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xencow/xencowfs/xencowfs.c Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,253 @@
+/******************************************************************************
+ * tools/xencow/xencowfs/xencowfs.c
+ *
+ * VM memory Copy-on-Write FUSE module
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#define FUSE_USE_VERSION 26
+
+
+#include <fuse.h>
+#include <string.h>
+#include <errno.h>
+//#include <xc_private.h>
+
+#include "../lib/xencow.h"
+
+
+#define DEBUG_OUTPUT 0
+
+
+static xencow_t *cow;
+
+/*
+ * Extract the snapshot number from a xencowfs path of the form
+ * "[/]<domid>.<snapshot>".
+ *
+ * The number is parsed from the characters following the last '.' in
+ * path.  Parsing must start after the dot: atoi() stops at the first
+ * non-digit, so handing it the string beginning with '.' always gives 0.
+ *
+ * Returns the snapshot number, or -1 when path contains no '.'.
+ */
+static inline int get_snapshot_num(const char *path)
+{
+    const char *dot = strrchr(path, '.');
+
+    if ( dot == NULL )
+        return -1;
+
+    return atoi(dot + 1);
+}
+
+/*
+ * Tell whether path names one of the snapshot files on cow->snapshots.
+ *
+ * A snapshot matches when its xencowfs_file equals path as given or, if
+ * path starts with '/', equals path without that leading slash.
+ *
+ * Returns 1 on a match and 0 otherwise.
+ */
+static int path_exists(const char *path)
+{
+    xencow_snapshot_t *entry;
+    const char *bare = (path[0] == '/') ? path + 1 : path;
+
+    list_for_each_entry ( entry, &cow->snapshots, list )
+    {
+        if ( strcmp(entry->xencowfs_file, path) == 0 )
+            return 1;
+
+        if ( strcmp(entry->xencowfs_file, bare) == 0 )
+            return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * FUSE create handler: any attempt to create a file takes a new snapshot.
+ * The requested name, mode and file info are ignored; the result of
+ * xencow_snapshot() is returned unchanged.
+ */
+static int xencowfs_create(const char *path, mode_t mode,
+                           struct fuse_file_info *fi)
+{
+    int rc;
+
+    (void) path;
+    (void) mode;
+    (void) fi;
+
+    rc = xencow_snapshot(cow);
+
+    return rc;
+}
+
+/*
+ * FUSE destroy handler: disables CoW for the global cow handle.  The
+ * private data pointer passed in by FUSE is unused.
+ */
+static void xencowfs_destroy(void *data)
+{
+    xencow_disable(cow);
+
+    (void) data;
+}
+
+/*
+ * FUSE getattr handler.
+ *
+ * "/" is reported as a directory (mode 0755, two links).  A path accepted
+ * by path_exists() is reported as a read-only regular file whose size is
+ * cow->p2m_size pages.  Anything else yields -ENOENT.  *stat is zeroed
+ * first in every case.
+ *
+ * The size is widened to off_t before shifting so it cannot wrap in a
+ * narrower type, and st_blocks is given in 512-byte units rather than in
+ * pages.
+ */
+static int xencowfs_getattr(const char *path, struct stat *stat)
+{
+    memset(stat, 0, sizeof(struct stat));
+
+    if ( strcmp(path, "/") == 0 )
+    {
+        stat->st_mode = S_IFDIR | 0755;
+        stat->st_nlink = 2;
+        return 0;
+    }
+
+    if ( !path_exists(path) )
+        return -ENOENT;
+
+    stat->st_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH;
+    stat->st_nlink = 1;
+    stat->st_size = (off_t)cow->p2m_size << PAGE_SHIFT;
+    stat->st_blksize = PAGE_SIZE;
+    stat->st_blocks = (blkcnt_t)cow->p2m_size * (PAGE_SIZE / 512);
+
+    return 0;
+}
+
+/*
+ * FUSE init handler: nothing to set up; the connection info is unused and
+ * no private data is handed back to FUSE.
+ */
+static void *xencowfs_init(struct fuse_conn_info *conn)
+{
+    (void) conn;
+
+    return NULL;
+}
+
+/*
+ * FUSE open handler.
+ *
+ * Returns -ENOENT when path is not a known snapshot file, -EACCES when
+ * the file is opened for anything other than reading, and 0 otherwise.
+ */
+static int xencowfs_open(const char *path, struct fuse_file_info *fi)
+{
+    if ( !path_exists(path) )
+        return -ENOENT;
+
+    /* O_ACCMODE selects the access-mode bits of the open flags. */
+    if ( (fi->flags & O_ACCMODE) != O_RDONLY )
+        return -EACCES;
+
+    return 0;
+}
+
+/*
+ * FUSE read handler: copy whole pages of a snapshot image into buffer.
+ *
+ * The snapshot is selected by get_snapshot_num(path).  Only page-aligned
+ * offsets are supported, and only the whole pages contained in size are
+ * read; the request is clamped to the end of the image (cow->p2m_size
+ * pages).
+ *
+ * Returns the number of bytes actually placed in buffer, which may be
+ * less than size when the request was clamped or size is not a multiple
+ * of the page size.  Returns 0 when size is 0 or offset is at or beyond
+ * the end of the image, -EINVAL for a negative or unaligned offset or a
+ * non-zero size smaller than one page, and otherwise the non-zero result
+ * of xencow_read_buffer().
+ */
+static int xencowfs_read(const char *path, char *buffer, size_t size,
+                         off_t offset, struct fuse_file_info *fi)
+{
+    int snapshot_num;
+    unsigned long start_pfn;
+    unsigned long num_pages;
+    int ret;
+
+    (void) fi;
+
+    if ( offset < 0 )
+        return -EINVAL;
+
+    if ( size == 0 )
+        return 0;
+
+    start_pfn = offset_pfn(offset);
+
+    /* Page align check: serving from a rounded offset would be wrong data */
+    if ( pfn_offset(start_pfn) != offset )
+    {
+        ERROR("Offset not page aligned!");
+        return -EINVAL;
+    }
+
+    /* Reading at or past the end of the domain's memory is end-of-file */
+    if ( start_pfn >= cow->p2m_size )
+        return 0;
+
+    /* A request smaller than one page cannot be served */
+    num_pages = size >> PAGE_SHIFT;
+    if ( num_pages == 0 )
+        return -EINVAL;
+
+    /* Check that it doesn't read past the end of the domain's memory */
+    if ( num_pages > cow->p2m_size - start_pfn )
+        num_pages = cow->p2m_size - start_pfn;
+
+    IPRINTF("size = %lx; offset = %lx; num_pages = %d\n",
+            (unsigned long)size, (unsigned long)offset, (int)num_pages);
+
+    /* Get the appropriate snapshot */
+    snapshot_num = get_snapshot_num(path);
+
+    ret = xencow_read_buffer(cow, snapshot_num, start_pfn, (int)num_pages,
+                             buffer);
+    if ( ret != 0 )
+        return ret;
+
+    /* Report only what was really filled in, not the requested size */
+    return (int)(num_pages << PAGE_SHIFT);
+}
+
+/*
+ * FUSE readdir handler.
+ *
+ * Only the root directory exists; any other path yields -ENOENT.  The
+ * listing consists of ".", ".." and the xencowfs_file name of every
+ * snapshot on cow->snapshots.  offset and fi are unused.  Returns 0 on
+ * success.
+ */
+static int xencowfs_readdir(const char *path, void *buffer,
+                            fuse_fill_dir_t filler, off_t offset,
+                            struct fuse_file_info *fi)
+{
+    xencow_snapshot_t *entry;
+    int is_root = (strcmp(path, "/") == 0);
+
+    (void) fi;
+    (void) offset;
+
+    if ( !is_root )
+        return -ENOENT;
+
+    filler(buffer, ".", NULL, 0);
+    filler(buffer, "..", NULL, 0);
+
+    list_for_each_entry(entry, &cow->snapshots, list)
+    {
+        filler(buffer, entry->xencowfs_file, NULL, 0);
+    }
+
+    return 0;
+}
+
+/*
+ * FUSE statfs handler.
+ *
+ * Reports a full, read-only style file system of cow->p2m_size blocks of
+ * PAGE_SIZE bytes: no free blocks, no inodes, names up to 255 bytes.
+ * path is unused.  Always returns 0.
+ */
+static int xencowfs_statfs(const char *path, struct statvfs *buf)
+{
+    (void) path;
+
+    buf->f_bsize = PAGE_SIZE;
+    /* f_frsize is the unit f_blocks is counted in, not a block count. */
+    buf->f_frsize = PAGE_SIZE;
+    buf->f_blocks = cow->p2m_size;
+    buf->f_bfree = 0;
+    buf->f_bavail = 0;
+    buf->f_files = 0;
+    buf->f_ffree = 0;
+    buf->f_favail = 0;
+    buf->f_fsid = 0;
+    buf->f_flag = 0;
+    buf->f_namemax = 255;
+
+    return 0;
+}
+
+/*
+ * FUSE callback table handed to fuse_main().  Only the operations listed
+ * here are implemented; no write-side handlers are provided.
+ */
+static struct fuse_operations xencowfs_oper = {
+ .create = xencowfs_create,
+ .destroy = xencowfs_destroy,
+ .getattr = xencowfs_getattr,
+ .init = xencowfs_init,
+ .open = xencowfs_open,
+ .read = xencowfs_read,
+ .readdir = xencowfs_readdir,
+ .statfs = xencowfs_statfs,
+};
+
+/*
+ * xencowfs <fuse args...> <domid>
+ *
+ * The last command-line argument is the target domain id; everything
+ * before it is passed on to fuse_main().  CoW is initialised and enabled
+ * for the domain before the FUSE loop starts.
+ *
+ * Exits with -EINVAL when the domain id parses to 0, -ENOMEM when
+ * xencow_init() fails, and the xencow_enable() result when enabling fails.
+ * Otherwise returns the fuse_main() result; when that is non-zero CoW is
+ * disabled first.
+ */
+int main(int argc, char *argv[])
+{
+    domid_t target;
+    int status;
+
+    IPRINTF("Start\n");
+
+    /* The last arg is the domain number */
+    target = atoi(argv[argc - 1]);
+    if ( target == 0 )
+    {
+        ERROR("Invalid domain");
+        exit(-EINVAL);
+    }
+
+    /* Initialise CoW */
+    IPRINTF("Initialise CoW\n");
+    cow = xencow_init(target);
+    if ( cow == NULL )
+    {
+        ERROR("Could not initialise CoW");
+        exit(-ENOMEM);
+    }
+
+    /* Enable CoW */
+    IPRINTF("Enable CoW\n");
+    status = xencow_enable(cow);
+    if ( status != 0 )
+    {
+        ERROR("Could not enable CoW: rc = %d", status);
+        exit(status);
+    }
+    IPRINTF("CoW enabled\n");
+
+    /* Hide the trailing domain id from FUSE's own argument parsing */
+    status = fuse_main(argc - 1, argv, &xencowfs_oper, NULL);
+    if ( status == 0 )
+        return 0;
+
+    xencow_disable(cow);
+
+    return status;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/arch/x86/domctl.c Mon Apr 20 10:21:49 2009 -0700
@@ -28,6 +28,7 @@
#include <asm/processor.h>
#include <xsm/xsm.h>
#include <xen/iommu.h>
+#include <asm/cow.h>
long arch_do_domctl(
struct xen_domctl *domctl,
@@ -1087,6 +1088,132 @@
}
break;
+ /* TODO: replace with XEN_DOMCTL_cow_op */
+ case XEN_DOMCTL_cow_enable:
+ {
+ struct domain *d;
+ void *ring_page;
+
+ ret = -EINVAL;
+ if ( domctl->domain == current->domain->domain_id )
+ break;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(domctl->domain);
+ if ( d == NULL )
+ break;
+
+ /* FIXME: Some other error code? */
+ ret = -EINVAL;
+ ring_page = map_domain_page_global(domctl->u.cow_enable.mfn);
+ if ( ring_page == NULL )
+ goto cow_enable_out;
+
+ BACK_RING_INIT(&d->arch.paging.cow.back_ring, (cow_sring_t
*)ring_page, PAGE_SIZE);
+
+ ret = 0;
+
+ cow_enable_out:
+ printk("CoW: enabled: ret: %ld\n", ret);
+ rcu_unlock_domain(d);
+ }
+ break;
+
+ case XEN_DOMCTL_cow_snapshot:
+ {
+ struct domain *d;
+ RING_IDX when;
+
+ ret = -EINVAL;
+ if ( domctl->domain == current->domain->domain_id )
+ break;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(domctl->domain);
+ if ( d == NULL )
+ break;
+
+ ret = cow_snapshot(d, &when);
+ if ( ret != 0 )
+ goto cow_snapshot_out;
+
+ domctl->u.cow_snapshot.when = when;
+ ret = 0;
+
+ if ( copy_to_guest(u_domctl, domctl, 1) )
+ ret = -EFAULT;
+
+ cow_snapshot_out:
+ rcu_unlock_domain(d);
+ }
+ break;
+
+ case XEN_DOMCTL_cow_resume:
+ {
+ struct domain *d;
+
+
+ ret = -EINVAL;
+ if ( domctl->domain == current->domain->domain_id )
+ break;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(domctl->domain);
+ if ( d == NULL )
+ break;
+
+ ret = cow_resume(d);
+
+ rcu_unlock_domain(d);
+ }
+ break;
+
+ case XEN_DOMCTL_cow_disable:
+ {
+ struct domain *d;
+
+ ret = -EINVAL;
+ if ( domctl->domain == current->domain->domain_id )
+ break;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(domctl->domain);
+ if ( d == NULL )
+ break;
+
+ ret = paging_log_dirty_disable(d);
+
+ rcu_unlock_domain(d);
+ }
+ break;
+
+ case XEN_DOMCTL_cow_page_type:
+ {
+ struct domain *d;
+ struct page_info *page;
+
+ ret = -EINVAL;
+ if ( domctl->domain == current->domain->domain_id )
+ break;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(domctl->domain);
+ if ( d == NULL )
+ break;
+
+ page = mfn_to_page(domctl->u.cow_page_type.mfn);
+
+ domctl->u.cow_page_type.count_info = page->count_info;
+ domctl->u.cow_page_type.type_info = page->u.inuse.type_info;
+ ret = 0;
+
+ if ( copy_to_guest(u_domctl, domctl, 1) )
+ ret = -EFAULT;
+
+ rcu_unlock_domain(d);
+ }
+ break;
+
default:
ret = -ENOSYS;
break;
diff -r 0477f9061c8a xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/arch/x86/hvm/hvm.c Mon Apr 20 10:21:49 2009 -0700
@@ -1542,8 +1542,8 @@
}
else
{
+ paging_mark_dirty(curr->domain, mfn);
memcpy(p, buf, count);
- paging_mark_dirty(curr->domain, mfn);
}
}
else
diff -r 0477f9061c8a xen/arch/x86/mm/Makefile
--- a/xen/arch/x86/mm/Makefile Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/arch/x86/mm/Makefile Mon Apr 20 10:21:49 2009 -0700
@@ -6,6 +6,7 @@
obj-y += guest_walk_2.o
obj-y += guest_walk_3.o
obj-$(x86_64) += guest_walk_4.o
+obj-y += cow.o
guest_walk_%.o: guest_walk.c Makefile
$(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@
diff -r 0477f9061c8a xen/arch/x86/mm/cow.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/cow.c Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,832 @@
+/******************************************************************************
+ * arch/x86/mm/cow.c
+ *
+ * CoW paging support
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ * Parts based on earlier work by Geoffrey Lefebvre
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#include <asm/cow.h>
+#include <asm/paging.h>
+#include <xen/event.h>
+
+
+/*
+ * COW_DOMAIN_PAUSE: when non-zero, cow_pause_domain()/cow_resume() use
+ * domain_pause()/domain_unpause(); when zero they adjust d->pause_count
+ * themselves and sleep/wake each vcpu without synchronising.
+ */
+#define COW_DOMAIN_PAUSE 0
+/* COW_DEBUG_OUTPUT: when non-zero, compile in the verbose printk() tracing. */
+#define COW_DEBUG_OUTPUT 0
+
+
+/*
+ * Printouts.  Every macro prefixes the message with the calling function's
+ * name.
+ */
+/* Trace message, sent to the debugtrace buffer. */
+#define PAGING_PRINTK(_f, _a...) \
+ debugtrace_printk("pg: %s(): " _f, __func__, ##_a)
+/* Error message, always sent to the console. */
+#define PAGING_ERROR(_f, _a...) \
+ printk("pg error: %s(): " _f, __func__, ##_a)
+/*
+ * Debug message gated on the PAGING_DEBUG_<flag> constant.  It goes to the
+ * console when COW_DEBUG_OUTPUT is set and to the debugtrace buffer
+ * otherwise.
+ */
+#if COW_DEBUG_OUTPUT
+#define PAGING_DEBUG(flag, _f, _a...) \
+ do { \
+ if (PAGING_DEBUG_ ## flag) \
+ printk("pgdebug: %s(): " _f, __func__, ##_a); \
+ } while (0)
+#else
+#define PAGING_DEBUG(flag, _f, _a...) \
+ do { \
+ if (PAGING_DEBUG_ ## flag) \
+ debugtrace_printk("pgdebug: %s(): " _f, __func__, ##_a); \
+ } while (0)
+#endif
+
+
+/*
+ * Sentinel "MFN" passed to cow_copy_page() when the page being stored is
+ * the saved vcpu state rather than a guest frame; it is also reported as
+ * the pfn of that ring response.
+ */
+#define STATE_MFN ((unsigned long)(-1))
+
+/* Barrier aliases -- presumably for the shared-ring macros; TODO confirm. */
+#define xen_mb() mb()
+#define xen_rmb() rmb()
+#define xen_wmb() wmb()
+
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_to_page
+#define mfn_to_page(_m) (frame_table + mfn_x(_m))
+#undef mfn_valid
+#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page)
+#undef page_to_mfn
+#define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
+
+/* The CoW lock. This protects the CoW bitmaps (cow.bitmap and
+ * cow.precow_foreign_bitmap) from concurrent accesses (and teardowns, etc).
+ *
+ * Locking discipline: always acquire log dirty lock before this one. */
+
+/* Initialise the lock and mark it as held by nobody. */
+#define cow_lock_init(_d) \
+ do { \
+ spin_lock_init(&(_d)->arch.paging.cow.lock); \
+ (_d)->arch.paging.cow.locker = -1; \
+ (_d)->arch.paging.cow.locker_function = "nobody"; \
+ } while (0)
+
+/*
+ * Take the lock.  The lock is not recursive: if the current processor is
+ * already recorded as the holder, report who took it and BUG().  On success
+ * the holder's processor and function name are recorded for diagnostics.
+ */
+#define cow_lock(_d) \
+ do { \
+ if (unlikely((_d)->arch.paging.cow.locker==current->processor)) \
+ { \
+ printk("Error: paging cow lock held by %s\n", \
+ (_d)->arch.paging.cow.locker_function); \
+ BUG(); \
+ } \
+ spin_lock(&(_d)->arch.paging.cow.lock); \
+ ASSERT((_d)->arch.paging.cow.locker == -1); \
+ (_d)->arch.paging.cow.locker = current->processor; \
+ (_d)->arch.paging.cow.locker_function = __func__; \
+ } while (0)
+
+/* Release the lock; must be called on the processor that took it. */
+#define cow_unlock(_d) \
+ do { \
+ ASSERT((_d)->arch.paging.cow.locker == current->processor); \
+ (_d)->arch.paging.cow.locker = -1; \
+ (_d)->arch.paging.cow.locker_function = "nobody"; \
+ spin_unlock(&(_d)->arch.paging.cow.lock); \
+ } while (0)
+
+
+/* XXX: ugly cut and paste from common/grant_table.c */
+/* Number of active grant entries held in one page of the active table. */
+#define ACGNT_PER_PAGE (PAGE_SIZE / sizeof(struct active_grant_entry))
+/*
+ * Active grant entry number e of grant table t.  The active table is an
+ * array of pages, each holding ACGNT_PER_PAGE entries.
+ */
+#define active_entry(t, e) \
+    ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
+
+
+/*
+ * Tasklet handler: tell dom0, via VIRQ_COW_PAUSE, that a domain has been
+ * paused by the CoW code.  The tasklet argument is unused.
+ */
+static void cow_notify_dom0_pause(unsigned long unused)
+{
+    printk("cow: notifying dom0 that domain is paused\n");
+    send_guest_global_virq(dom0, VIRQ_COW_PAUSE);
+}
+static DECLARE_TASKLET(cow_notify_dom0_pause_tasklet,
+                       cow_notify_dom0_pause, 0);
+
+/*
+ * Tasklet handler: tell dom0, via VIRQ_COW_BUFFER, that the ring has passed
+ * its high water mark, then clear the "already notified" flag.
+ *
+ * flag_addr is the address of a bool_t; cow_check_threshold() stores it in
+ * the tasklet's data field before scheduling the tasklet.
+ */
+static void cow_notify_dom0_high_water(unsigned long flag_addr)
+{
+    printk("cow: notifying dom0 that ring buffer passed high water mark\n");
+    send_guest_global_virq(dom0, VIRQ_COW_BUFFER);
+    (*(bool_t *)flag_addr) = 0;
+}
+static DECLARE_TASKLET(cow_notify_dom0_high_water_tasklet,
+                       cow_notify_dom0_high_water, 0);
+
+/* Free one CoW bitmap, if allocated, and reset the caller's pointer. */
+static void paging_free_cow_bitmap(unsigned long **bitmap)
+{
+    /* Nothing to do for a bitmap that was never allocated. */
+    if ( unlikely(*bitmap == NULL) )
+        return;
+
+    printk("cow: freeing bitmap\n");
+    xfree(*bitmap);
+    *bitmap = NULL;
+}
+
+/* Free both CoW bitmaps (pre-CoW foreign bitmap first, then saved-page). */
+static void paging_free_cow(struct domain *d)
+{
+    struct cow_domain *cow = &d->arch.paging.cow;
+
+    printk("cow: freeing bitmaps\n");
+    paging_free_cow_bitmap(&cow->precow_foreign_bitmap);
+    paging_free_cow_bitmap(&cow->bitmap);
+}
+
+/*
+ * Make sure *bitmap points at a zeroed bitmap of bitmap_size bits.
+ *
+ * An existing bitmap is reused; otherwise one is allocated.  bitmap_size
+ * must be non-zero.  Returns 0 on success or -ENOMEM if allocation fails.
+ */
+static int paging_alloc_cow_bitmap(unsigned long **bitmap,
+                                   unsigned long bitmap_size)
+{
+    unsigned long *map;
+
+    BUG_ON(bitmap_size == 0);
+
+    map = *bitmap;
+    if ( unlikely(map == NULL) )
+    {
+        map = xmalloc_array(unsigned long, bitmap_size / BITS_PER_LONG);
+        if ( unlikely(map == NULL) )
+            return -ENOMEM;
+
+        *bitmap = map;
+    }
+
+    /* bitmap_size counts bits; memset() wants bytes. */
+    memset(map, 0, bitmap_size / 8);
+
+    return 0;
+}
+
+/* Get address of current buffer page for a given domain */
+/*
+ * Consume the next request from the domain's CoW back ring and return the
+ * MFN of the buffer page it carries.
+ *
+ * The ring lock is held while the request is copied out and the consumer
+ * index advanced.  This function does not check that a request is actually
+ * pending; callers run cow_check_threshold() first.
+ */
+static unsigned long cow_get_buffer_page(struct domain *d)
+{
+    cow_request_t req;
+    cow_back_ring_t *back_ring;
+    RING_IDX req_cons;
+
+    cow_ring_lock(d);
+
+    back_ring = &d->arch.paging.cow.back_ring;
+    req_cons = back_ring->req_cons;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: xen_page_for_domain %d\n", d->domain_id);
+#endif
+
+    /* Get buffer page */
+    memcpy(&req, (RING_GET_REQUEST(back_ring, req_cons)), sizeof(req));
+    req_cons++;
+
+    /* Publish the new consumer index and ask for an event on the next one. */
+    back_ring->req_cons = req_cons;
+    back_ring->sring->req_event = req_cons + 1;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: num: %lu buffer mfn %" PRI_mfn "\n",
+           (unsigned long)req_cons, req.mfn);
+#endif
+
+    cow_ring_unlock(d);
+
+    return req.mfn;
+}
+
+/*
+ * Copy one page of data into the next buffer page taken from the ring and
+ * post a response carrying the page's PFN.
+ *
+ * guest_mfn is the frame being saved, or STATE_MFN when guest_page holds
+ * saved vcpu state instead of guest memory.  guest_page is an
+ * already-mapped pointer to the PAGE_SIZE bytes to copy.
+ */
+static void cow_copy_page(struct domain *d, unsigned long guest_mfn,
+                          void *guest_page)
+{
+    unsigned long pfn = STATE_MFN;
+    unsigned long ring_mfn;
+    void *dst;
+    cow_response_t rsp;
+    cow_back_ring_t *ring;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: copy page: start\n");
+#endif
+
+    /* We /really/ mean PFN here, even for non-translated guests. */
+    if ( guest_mfn != STATE_MFN )
+    {
+        pfn = get_gpfn_from_mfn(guest_mfn);
+        BUG_ON(!VALID_M2P(pfn));
+    }
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: copy page: locking ring\n");
+#endif
+
+    /* Take a destination page from the ring and map it. */
+    ring_mfn = cow_get_buffer_page(d);
+    dst = map_domain_page(ring_mfn);
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: copy page: mapped buffer page\n");
+#endif
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: copy guest page\n");
+#endif
+    memcpy(dst, guest_page, PAGE_SIZE);
+
+    PAGING_DEBUG(COW,
+                 "copied page: mfn %" PRI_mfn
+                 "; pfn %lx; page first chunk (%lx, %lx) from dom %d\n",
+                 guest_mfn, pfn, *((unsigned long*)guest_page),
+                 *((unsigned long*)dst), d->domain_id);
+
+    unmap_domain_page(dst);
+
+    /* Post the response: the consumer learns which PFN the buffer holds. */
+    cow_ring_lock(d);
+
+    ring = &d->arch.paging.cow.back_ring;
+
+    rsp.pfn = pfn;
+    memcpy(RING_GET_RESPONSE(ring, ring->rsp_prod_pvt), &rsp, sizeof(rsp));
+
+    /* One more page copied. */
+    ring->rsp_prod_pvt++;
+    RING_PUSH_RESPONSES(ring);
+
+    cow_ring_unlock(d);
+}
+
+/* Save a page into a buffer */
+/* Map the guest frame guest_mfn, copy it into a ring buffer page, unmap. */
+static void cow_save_page(struct domain *d, unsigned long guest_mfn)
+{
+    void *src = map_domain_page(guest_mfn);
+
+    cow_copy_page(d, guest_mfn, src);
+    unmap_domain_page(src);
+}
+
+/*
+ * Pause domain d because the ring has no room to save guest_mfn, and notify
+ * dom0 through a tasklet.
+ *
+ * The frame, and whether the request came from the pre-dirty path, are
+ * recorded so that cow_resume() can replay the operation.  Does nothing
+ * beyond a debug message if the domain is already marked as paused.
+ */
+static void cow_pause_domain(struct domain *d, unsigned long guest_mfn,
+                             bool_t is_pre_dirty)
+{
+    struct cow_domain *cow = &d->arch.paging.cow;
+#if !COW_DOMAIN_PAUSE
+    struct vcpu *v;
+#endif
+
+    if ( cow->is_paused )
+    {
+        PAGING_DEBUG(COW,
+                     "domain already paused domain %d; mfn: %" PRI_mfn "\n",
+                     d->domain_id, guest_mfn);
+        return;
+    }
+
+    /* Remember what has to be replayed on resume. */
+    cow->paused_guest_mfn = guest_mfn;
+    cow->is_paused_pre_dirty = is_pre_dirty;
+    cow->is_paused = 1;
+
+    PAGING_DEBUG(COW,
+                 "not enough buffer space, pausing domain %d; mfn: %"
+                 PRI_mfn "\n", d->domain_id, guest_mfn);
+
+    printk("cow: pausing domain\n");
+
+#if COW_DOMAIN_PAUSE
+    domain_pause(d);
+#else
+    /* Take a pause reference and put every vcpu to sleep without waiting. */
+    atomic_inc(&d->pause_count);
+
+    for_each_vcpu( d, v )
+    {
+        vcpu_sleep_nosync(v);
+    }
+#endif
+
+    tasklet_schedule(&cow_notify_dom0_pause_tasklet);
+}
+
+/*
+ * Prepare the two CoW bitmaps for a fresh snapshot: allocate them if needed
+ * and clear them.
+ *
+ * Returns 0 on success or the allocation error.  On failure the bitmap
+ * allocated here is released, and so is the log-dirty bitmap (see FIXME).
+ */
+static int cow_new_snapshot(struct domain *d)
+{
+    struct cow_domain *cow = &d->arch.paging.cow;
+    int rc;
+
+    rc = paging_alloc_cow_bitmap(&cow->precow_foreign_bitmap,
+                                 cow->bitmap_size);
+    if ( unlikely(rc != 0) )
+    {
+        /* FIXME: This probably shouldn't be here any more... */
+        paging_free_log_dirty_bitmap(d);
+        return rc;
+    }
+
+    rc = paging_alloc_cow_bitmap(&cow->bitmap, cow->bitmap_size);
+    if ( unlikely(rc != 0) )
+    {
+        paging_free_cow_bitmap(&cow->precow_foreign_bitmap);
+        /* FIXME: This probably shouldn't be here any more... */
+        paging_free_log_dirty_bitmap(d);
+        return rc;
+    }
+
+#if COW_DEBUG_OUTPUT
+    printk("cow enabled for dom %d\n", d->domain_id);
+#endif
+
+    return 0;
+}
+
+/* Check to make sure there's enough space in the buffer to continue */
+/*
+ * Check to make sure there's enough space in the buffer to continue.
+ *
+ * The free slot count is the number of requests (buffer pages) posted on
+ * the ring that have not been consumed yet.
+ *
+ * Returns -EBUSY while the domain is marked as paused by CoW, -ENOSPC when
+ * fewer than XEN_COW_RING_THRESHOLD slots remain, and 0 otherwise.  When
+ * the count drops below the high water mark and dom0 has not been told yet,
+ * the high-water notification tasklet is scheduled.
+ */
+static int cow_check_threshold(struct domain *d)
+{
+    RING_IDX req_prod;
+    RING_IDX req_cons;
+    RING_IDX free_slots;
+
+    req_prod = d->arch.paging.cow.back_ring.sring->req_prod;
+    req_cons = d->arch.paging.cow.back_ring.req_cons;
+
+    if ( unlikely(d->arch.paging.cow.is_paused) )
+    {
+        printk("cow_paging: check_threshold: domain still paused\n");
+        return -EBUSY;
+    }
+
+    free_slots = req_prod - req_cons;
+
+    if ( unlikely(free_slots < XEN_COW_RING_THRESHOLD) )
+    {
+        /* RING_IDX is unsigned, hence %u. */
+        printk("cow_paging: check_threshold: no space left: req_prod = %u;"
+               "req_cons = %u; free_slots = %u\n",
+               req_prod, req_cons, free_slots);
+        return -ENOSPC;
+    }
+
+    /* Notify ring buffer consumer that we've crossed the high water mark */
+    if ( !d->arch.paging.cow.notified_high_water
+         && (free_slots < d->arch.paging.cow.ring_high_water) )
+    {
+#if COW_DEBUG_OUTPUT
+        printk("cow: check_threshold: passed high water mark\n");
+#endif
+
+        d->arch.paging.cow.notified_high_water = 1;
+        /* The tasklet handler clears the flag through this address. */
+        cow_notify_dom0_high_water_tasklet.data =
+            (unsigned long)&d->arch.paging.cow.notified_high_water;
+        tasklet_schedule(&cow_notify_dom0_high_water_tasklet);
+    }
+
+    return 0;
+}
+
+/*
+ * Save the user register state of every vcpu of d into the snapshot.
+ *
+ * The register blocks are packed back to back, in vcpu iteration order,
+ * into one zeroed page that is pushed to the ring tagged with STATE_MFN.
+ *
+ * Returns 0 on success, -ENOMEM if the staging page cannot be allocated,
+ * or -E2BIG if the register blocks of all vcpus do not fit in one page.
+ */
+static int cow_save_state(struct domain *d)
+{
+    struct vcpu *v;
+    void *vcpu_page;
+    const unsigned long regs_size = sizeof(v->arch.guest_context.user_regs);
+    unsigned long offset = 0;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: start\n");
+#endif
+
+    vcpu_page = xmalloc_bytes(PAGE_SIZE);
+    if ( unlikely(vcpu_page == NULL) )
+        return -ENOMEM;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: allocated page\n");
+#endif
+
+    memset(vcpu_page, 0, PAGE_SIZE);
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: cleared page\n");
+#endif
+
+    /* Save state for each vcpu */
+    for_each_vcpu(d, v)
+    {
+        /*
+         * Only one page is staged: refuse to write past its end rather
+         * than corrupt the heap when the domain has too many vcpus.
+         */
+        if ( unlikely(offset + regs_size > PAGE_SIZE) )
+        {
+            xfree(vcpu_page);
+            return -E2BIG;
+        }
+
+        memcpy((char *)vcpu_page + offset,
+               &v->arch.guest_context.user_regs, regs_size);
+        offset += regs_size;
+    }
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: copied CPU info\n");
+#endif
+
+#if 0
+    cow_copy_page(d, STATE_MFN, d->shared_info);
+#endif
+    cow_copy_page(d, STATE_MFN, vcpu_page);
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: copied pages\n");
+#endif
+
+    xfree(vcpu_page);
+
+#if COW_DEBUG_OUTPUT
+    printk("cow: save state: done\n");
+#endif
+
+    return 0;
+}
+
+/*
+ * Save every frame of d that is currently granted writable to another
+ * domain (host or device write pin set) and flag it in the pre-CoW foreign
+ * bitmap.
+ *
+ * Runs under the domain's grant table lock.  A first pass counts the
+ * matching grants so that nothing is saved unless the ring has room for all
+ * of them plus XEN_COW_RING_THRESHOLD spare slots.
+ *
+ * Returns 0 on success.  Returns -ENOSPC, after setting
+ * is_paused_scan_foreign, when the ring is too short on space; nothing has
+ * been saved in that case.
+ */
+static int cow_scan_foreign_mapping(struct domain *d)
+{
+ RING_IDX req_prod;
+ RING_IDX req_cons;
+ int free_slots;
+ unsigned int num_entries;
+ unsigned int i;
+ int ret;
+
+ ASSERT(d->arch.paging.cow.precow_foreign_bitmap != NULL);
+
+ /* Get the grant table lock */
+ spin_lock(&d->grant_table->lock);
+
+ /* Find active entries */
+ num_entries = 0;
+ for ( i = 0; i < nr_grant_entries(d->grant_table); i++ )
+ {
+ struct active_grant_entry *act = &active_entry(d->grant_table, i);
+
+ /* XXX: Is pin guaranteed to be zero for an inactive grant? */
+ /* XXX: Do I need to worry about device mapping? */
+ if ( act->pin & GNTPIN_hstw_mask || act->pin & GNTPIN_devw_mask )
+ num_entries++;
+ }
+
+ /* Make sure there's enough buffer space for this */
+ req_prod = d->arch.paging.cow.back_ring.sring->req_prod;
+ req_cons = d->arch.paging.cow.back_ring.req_cons;
+ free_slots = req_prod - req_cons;
+
+ ret = -ENOSPC;
+ /* NOTE(review): free_slots is int, num_entries unsigned; the comparison
+ * is done in unsigned arithmetic. */
+ if ( unlikely(free_slots < num_entries + XEN_COW_RING_THRESHOLD) )
+ {
+ printk("cow_paging: scan_foreign: not enough space left\n");
+ d->arch.paging.cow.is_paused_scan_foreign = 1;
+ goto out;
+ }
+
+ /* For each entry in the active list, save the page */
+ for ( i = 0; i < nr_grant_entries(d->grant_table); i++ )
+ {
+ struct active_grant_entry *act = &active_entry(d->grant_table, i);
+
+ if ( act->pin & GNTPIN_hstw_mask || act->pin & GNTPIN_devw_mask )
+ {
+ mfn_t gmfn;
+ unsigned long pfn;
+
+ gmfn = _mfn(act->frame);
+
+ /* We /really/ mean PFN here, even for non-translated guests. */
+ pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+ ASSERT(VALID_M2P(pfn));
+ ASSERT(mfn_valid(gmfn));
+
+ /* Set the bit in the precow bitmap */
+#if 1
+ __set_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap);
+#else
+ set_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap);
+#endif
+
+ /*
+ * If we have mapping with other domain, we won't be able
+ * to coordinate with them so just save page to be safe
+ */
+ cow_save_page(d, act->frame);
+ }
+ }
+
+ ret = 0;
+ out:
+ /* Release lock */
+ spin_unlock(&d->grant_table->lock);
+ return ret;
+}
+
+/*
+ * Take a snapshot of d: check the ring has room, reset the CoW bitmaps,
+ * save the vcpu state and save all pages currently granted writable to
+ * other domains.
+ *
+ * Returns 0 on success or the error of the first of the first three steps
+ * that fails.  The result of the foreign-mapping scan is not propagated.
+ */
+static int cow_take_snapshot(struct domain *d)
+{
+    int rc;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow checking threshold\n");
+#endif
+    if ( unlikely((rc = cow_check_threshold(d)) != 0) )
+        return rc;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow new snapshot\n");
+#endif
+    if ( unlikely((rc = cow_new_snapshot(d)) != 0) )
+        return rc;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow saving state\n");
+#endif
+    if ( unlikely((rc = cow_save_state(d)) != 0) )
+        return rc;
+
+#if COW_DEBUG_OUTPUT
+    printk("cow scan foreign\n");
+#endif
+
+    /*
+     * Fill the precow bitmap by scanning the active grant list.
+     * We are racing with devices here, so we need to coordinate
+     * We will probably only coordinate with dom0.
+     */
+    cow_scan_foreign_mapping(d);
+
+#if COW_DEBUG_OUTPUT
+    printk("cow snapshot taken\n");
+#endif
+
+    return 0;
+}
+
+/* Per-domain CoW initialisation: CoW starts disabled, both locks are set up. */
+void cow_init(struct domain *d)
+{
+    disable_cow(d);
+
+    cow_ring_lock_init(d);
+    cow_lock_init(d);
+}
+
+/* Free both CoW bitmaps of d, holding the CoW lock while doing so. */
+void cow_teardown(struct domain *d)
+{
+ cow_lock(d);
+ paging_free_cow(d);
+ cow_unlock(d);
+}
+
+/*
+ * Enable CoW for d.
+ *
+ * Sizes the CoW bitmaps from the domain's maximum GPFN (rounded up to a
+ * whole number of longs), sets the ring high water mark to half the ring
+ * size, clears the pause bookkeeping and marks the domain as CoW-enabled.
+ * The bitmaps themselves are allocated later, by cow_new_snapshot().
+ *
+ * Returns 0 on success or -EINVAL if CoW is already enabled.
+ */
+int cow_enable(struct domain *d)
+{
+    int ret;
+
+    cow_lock(d);
+
+    ret = -EINVAL;
+    if ( cow_enabled(d) )
+        goto out;
+
+    /* One bit per GPFN in [0, max_gpfn], rounded up to BITS_PER_LONG. */
+    d->arch.paging.cow.bitmap_size =
+        (domain_get_maximum_gpfn(d) + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
+
+    /* 50% high water mark */
+    d->arch.paging.cow.ring_high_water =
+        RING_SIZE(&d->arch.paging.cow.back_ring) >> 1;
+    d->arch.paging.cow.notified_high_water = 0;
+
+    d->arch.paging.cow.is_paused = 0;
+    d->arch.paging.cow.is_paused_pre_dirty = 0;
+    d->arch.paging.cow.is_paused_scan_foreign = 0;
+    d->arch.paging.cow.paused_guest_mfn = 0;
+
+    enable_cow(d);
+
+    ret = 0;
+
+ out:
+    cow_unlock(d);
+    return ret;
+}
+
+/*
+ * Disable CoW for d: clear the enabled flag first, then free both CoW
+ * bitmaps under the CoW lock.
+ */
+void cow_disable(struct domain *d)
+{
+ printk("cow: disable cow for domain %d\n", d->domain_id);
+
+ /* The flag is cleared before the lock is taken. */
+ disable_cow(d);
+
+ cow_lock(d);
+ paging_free_cow(d);
+ cow_unlock(d);
+}
+
+/* Take proper action when a page is mapped writable in a foreign domain */
+void cow_pre_dirty(struct domain *d, unsigned long guest_mfn)
+{
+ unsigned long pfn;
+ mfn_t gmfn;
+ int rc;
+
+ gmfn = _mfn(guest_mfn);
+
+ /* We /really/ mean PFN here, even for non-translated guests. */
+ pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+ BUG_ON(!VALID_M2P(pfn));
+
+ cow_lock(d);
+
+ ASSERT(d->arch.paging.cow.precow_foreign_bitmap != NULL);
+ ASSERT(d->arch.paging.cow.bitmap != NULL);
+ BUG_ON( test_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap) );
+
+ if ( !__test_and_set_bit(pfn, d->arch.paging.cow.bitmap) )
+ {
+#if COW_DEBUG_OUTPUT
+ printk("cow: pre dirty: mfn = %lx\n", guest_mfn);
+#endif
+
+ rc = cow_check_threshold(d);
+ if ( rc != 0 )
+ {
+ __clear_bit(pfn, d->arch.paging.cow.bitmap);
+ cow_pause_domain(d, guest_mfn, 1);
+ goto out;
+ }
+
+ cow_save_page(d, guest_mfn);
+ }
+
+ out:
+ cow_unlock(d);
+}
+
+/*
+ * Called when guest_mfn of d is being marked dirty: save the page to the
+ * ring unless it has already been saved for this snapshot.
+ *
+ * Frames without a valid M2P entry are ignored.  A frame flagged in the
+ * pre-CoW foreign bitmap is not saved here; it is only moved from that
+ * bitmap to the saved bitmap.  If the ring is too full, the page is left
+ * unmarked and the domain is paused so that cow_resume() can replay the
+ * request.
+ */
+void cow_mark_dirty(struct domain *d, unsigned long guest_mfn)
+{
+ unsigned long pfn;
+ mfn_t gmfn;
+ int rc;
+
+ cow_lock(d);
+
+ ASSERT(d->arch.paging.cow.precow_foreign_bitmap != NULL);
+ ASSERT(d->arch.paging.cow.bitmap != NULL);
+
+ gmfn = _mfn(guest_mfn);
+
+ /* We /really/ mean PFN here, even for non-translated guests. */
+ pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+ /*
+ * Values with the MSB set denote MFNs that aren't really part of the
+ * domain's pseudo-physical memory map (e.g., the shared info frame).
+ * Nothing to do here...
+ */
+ if ( unlikely(!VALID_M2P(pfn)) )
+ goto out;
+
+ /* Test saved_page bitmap */
+#if 0
+#if COW_DEBUG_OUTPUT
+ if ( test_bit(pfn, d->arch.paging.cow.bitmap) )
+ printk("cow: already marked dirty: mfn = %lx\n", guest_mfn);
+#endif
+#endif
+
+ /* Test precow bitmap */
+ if ( test_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap) )
+ {
+ /*
+ * This is either a ring page (ok) or
+ * the guest is racing with a device to
+ * write to the page, but since we are racing
+ * with a device, we can't really save the page either.
+ * In the latter case, the checkpoint will most
+ * likely be broken.
+ */
+ PAGING_DEBUG(COW,
+ "write to precow foreign page %" PRI_mfn
+ " (pfn=%lx), dom %d\n",
+ mfn_x(gmfn), pfn, d->domain_id);
+
+ /*
+ * We clear this bit, since the state of the page is now defined
+ * and part of the snapshot, so we want to protect the page if we
+ * write to it.
+ */
+ __clear_bit(pfn, d->arch.paging.cow.precow_foreign_bitmap);
+
+ /* Mark the page as saved; its content was stored by the grant scan. */
+ if ( !__test_and_set_bit(pfn, d->arch.paging.cow.bitmap) )
+ {
+ PAGING_DEBUG(COW,
+ "marked precow foreign mfn %"
+ PRI_mfn " (pfn=%lx), dom %d\n",
+ mfn_x(gmfn), pfn, d->domain_id);
+ }
+ }
+ else if ( !__test_and_set_bit(pfn, d->arch.paging.cow.bitmap) )
+ {
+ /* Save the page */
+#if COW_DEBUG_OUTPUT
+ printk("cow: mark dirty: mfn = %lx\n", guest_mfn);
+#endif
+ rc = cow_check_threshold(d);
+ if ( rc != 0 )
+ {
+ /* No room: undo the mark and pause until the ring is refilled. */
+ __clear_bit(pfn, d->arch.paging.cow.bitmap);
+ cow_pause_domain(d, guest_mfn, 0);
+ goto out;
+ }
+
+ cow_save_page(d, guest_mfn);
+ }
+
+ out:
+ cow_unlock(d);
+}
+
+/*
+ * Take a CoW snapshot of d.
+ *
+ * The domain is paused for the duration.  If CoW is already enabled,
+ * log-dirty mode is disabled first; if CoW is not enabled at that point,
+ * log-dirty mode is enabled with CoW requested.  The ring's current
+ * rsp_prod is stored in *when and the snapshot is taken under the CoW lock.
+ *
+ * Returns the result of cow_take_snapshot(), or the error from
+ * paging_log_dirty_enable(); in the latter case *when is not written.
+ */
+int cow_snapshot(struct domain *d, RING_IDX *when)
+{
+ int ret;
+
+ /* FIXME: Try not to pause/unpause all the time */
+ domain_pause(d);
+
+ /* FIXME: Try not to disable/enable log dirty all the time */
+ if ( cow_enabled(d) )
+ paging_log_dirty_disable(d);
+
+ /* NOTE(review): relies on the disable above having cleared the CoW flag
+ * when CoW was previously enabled -- confirm. */
+ if ( !cow_enabled(d) )
+ {
+ ret = paging_log_dirty_enable(d, 1);
+ if ( ret != 0 )
+ goto out;
+ }
+
+ cow_lock(d);
+ /* Responses produced from here on belong to the new snapshot. */
+ *when = d->arch.paging.cow.back_ring.sring->rsp_prod;
+#if COW_DEBUG_OUTPUT
+ printk("cow: snapshot: when = %d\n", *when);
+#endif
+ ret = cow_take_snapshot(d);
+#if COW_DEBUG_OUTPUT
+ printk("cow: snapshot: took snapshot = %d\n", ret);
+#endif
+ cow_unlock(d);
+
+ out:
+ domain_unpause(d);
+
+ return ret;
+}
+
+/*
+ * Resume a domain that was paused because the ring ran out of space.
+ *
+ * Returns -EINVAL if CoW is not enabled and 0 if the domain is not paused
+ * by CoW.  If the ring still lacks space, the domain stays paused and the
+ * error from cow_check_threshold() is returned.  Otherwise the operation
+ * that caused the pause is replayed (pre-dirty, foreign-mapping scan or
+ * mark-dirty), the pause reference is dropped and 0 is returned.
+ */
+int cow_resume(struct domain *d)
+{
+#if !COW_DOMAIN_PAUSE
+ struct vcpu *v;
+#endif
+ int ret;
+
+ cow_lock(d);
+
+ ret = -EINVAL;
+ if ( !cow_enabled(d) )
+ goto out;
+
+ if ( d->arch.paging.cow.is_paused == 0 )
+ {
+ ret = 0;
+ goto out;
+ }
+
+ /* cow_check_threshold() reports -EBUSY while is_paused is set, so clear
+ * it for the check and restore it if there is still no space. */
+ d->arch.paging.cow.is_paused = 0;
+ ret = cow_check_threshold(d);
+ if ( ret != 0 )
+ {
+ d->arch.paging.cow.is_paused = 1;
+ goto out;
+ }
+
+ /* Replay the pending operation.  cow_pre_dirty() and cow_mark_dirty()
+ * take the CoW lock themselves, so it is dropped before calling them. */
+ /* NOTE(review): paused_guest_mfn is read after the lock is dropped, and
+ * it is reset below even if the replay pauses the domain again. */
+ if ( d->arch.paging.cow.is_paused_pre_dirty )
+ {
+ d->arch.paging.cow.is_paused_pre_dirty = 0;
+ cow_unlock(d);
+ cow_pre_dirty(d, d->arch.paging.cow.paused_guest_mfn);
+ }
+ else if ( d->arch.paging.cow.is_paused_scan_foreign )
+ {
+ d->arch.paging.cow.is_paused_scan_foreign = 0;
+ /* The result of the scan is not checked. */
+ cow_scan_foreign_mapping(d);
+ cow_unlock(d);
+ }
+ else
+ {
+ cow_unlock(d);
+ cow_mark_dirty(d, d->arch.paging.cow.paused_guest_mfn);
+ }
+
+ d->arch.paging.cow.paused_guest_mfn = 0;
+
+#if COW_DOMAIN_PAUSE
+ domain_unpause(d);
+#else
+ /* Drop the pause reference; wake the vcpus if it was the last one. */
+ if ( atomic_dec_and_test(&d->pause_count) )
+ for_each_vcpu( d, v )
+// vcpu_unpause(v);
+ vcpu_wake(v);
+#endif
+
+ return 0;
+
+ out:
+ cow_unlock(d);
+ return ret;
+}
diff -r 0477f9061c8a xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/arch/x86/mm/paging.c Mon Apr 20 10:21:49 2009 -0700
@@ -26,8 +26,10 @@
#include <asm/p2m.h>
#include <asm/hap.h>
#include <asm/guest_access.h>
+#include <asm/cow.h>
#include <xen/numa.h>
#include <xsm/xsm.h>
+#include <xen/grant_table.h>
#define hap_enabled(d) (is_hvm_domain(d) && (d)->arch.hvm_domain.hap_enabled)
@@ -158,7 +160,7 @@
{
d->arch.paging.log_dirty.allocs--;
free_domheap_page(mfn_to_page(mfn));
-}
+}
void paging_free_log_dirty_bitmap(struct domain *d)
{
@@ -207,7 +209,7 @@
d->arch.paging.log_dirty.failed_allocs = 0;
}
-int paging_log_dirty_enable(struct domain *d)
+int paging_log_dirty_enable(struct domain *d, bool_t enable_cow)
{
int ret;
@@ -226,6 +228,9 @@
paging_free_log_dirty_bitmap(d);
goto out;
}
+
+ if ( enable_cow )
+ cow_enable(d);
log_dirty_unlock(d);
@@ -253,11 +258,33 @@
ret = d->arch.paging.log_dirty.disable_log_dirty(d);
log_dirty_lock(d);
if ( !paging_mode_log_dirty(d) )
+ {
paging_free_log_dirty_bitmap(d);
+
+ if ( cow_enabled(d) )
+ cow_disable(d);
+ }
log_dirty_unlock(d);
domain_unpause(d);
return ret;
+}
+
+/*
+ * Hook called before guest_mfn of d is about to be written by someone
+ * other than the guest.  Does nothing unless the domain is in log-dirty
+ * mode and the MFN is valid; otherwise the request is forwarded, under the
+ * log-dirty lock, to cow_pre_dirty() when CoW is enabled.
+ */
+void paging_pre_dirty(struct domain *d, unsigned long guest_mfn)
+{
+    mfn_t gmfn = _mfn(guest_mfn);
+
+    if ( paging_mode_log_dirty(d) && mfn_valid(gmfn) )
+    {
+        log_dirty_lock(d);
+
+        if ( cow_enabled(d) )
+            cow_pre_dirty(d, guest_mfn);
+
+        log_dirty_unlock(d);
+    }
}
/* Mark a page as dirty */
@@ -327,11 +354,14 @@
unmap_domain_page(l1);
if ( changed )
{
- PAGING_DEBUG(LOGDIRTY,
+ PAGING_DEBUG(LOGDIRTY,
"marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
mfn_x(gmfn), pfn, d->domain_id);
d->arch.paging.log_dirty.dirty_count++;
}
+
+ if ( cow_enabled(d) )
+ cow_mark_dirty(d, guest_mfn);
out:
log_dirty_unlock(d);
@@ -471,13 +501,20 @@
d->arch.paging.log_dirty.disable_log_dirty = disable_log_dirty;
d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+
+ cow_init(d);
}
/* This function fress log dirty bitmap resources. */
void paging_log_dirty_teardown(struct domain*d)
{
log_dirty_lock(d);
+
paging_free_log_dirty_bitmap(d);
+
+ /* Also release the CoW bitmaps if CoW is still enabled for the domain. */
+ if ( cow_enabled(d) )
+ cow_teardown(d);
+
log_dirty_unlock(d);
}
/************************************************/
@@ -552,11 +589,11 @@
switch ( sc->op )
{
case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
- return paging_log_dirty_enable(d);
+ return paging_log_dirty_enable(d, 0);
case XEN_DOMCTL_SHADOW_OP_ENABLE:
if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
- return paging_log_dirty_enable(d);
+ return paging_log_dirty_enable(d, 0);
case XEN_DOMCTL_SHADOW_OP_OFF:
if ( paging_mode_log_dirty(d) )
diff -r 0477f9061c8a xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c Mon Apr 20 10:21:49 2009 -0700
@@ -36,6 +36,7 @@
#include <asm/hvm/cacheattr.h>
#include <asm/mtrr.h>
#include <asm/guest_pt.h>
+#include <asm/paging.h>
#include "private.h"
#include "types.h"
@@ -4598,6 +4599,8 @@
}
#endif
+ paging_pre_dirty(v->domain, mfn_x(sh_ctxt->mfn1));
+
/* Unaligned writes mean probably this isn't a pagetable */
if ( vaddr & (bytes - 1) )
sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ );
@@ -4623,6 +4626,8 @@
MAPPING_EXCEPTION :
(mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ?
MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE);
+
+ paging_pre_dirty(v->domain, mfn_x(sh_ctxt->mfn2));
/* Cross-page writes mean probably not a pagetable */
sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ );
diff -r 0477f9061c8a xen/common/grant_table.c
--- a/xen/common/grant_table.c Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/common/grant_table.c Mon Apr 20 10:21:49 2009 -0700
@@ -352,6 +352,15 @@
goto undo_out;
}
+ /*
+ * If the mapping is writable, do something before the page is mapped.
+ * We may end up doing something useless if the mapping fails but
+ * otherwise we could end up racing with the guest
+ * (although very unlikely)
+ */
+ if ( !(op->flags & GNTMAP_readonly) )
+ gnttab_pre_dirty(rd, frame);
+
rc = create_grant_host_mapping(
op->host_addr, frame, op->flags, cache_flags);
if ( rc != GNTST_okay )
@@ -582,7 +591,7 @@
/* If just unmapped a writable mapping, mark as dirtied */
if ( !(op->flags & GNTMAP_readonly) )
- gnttab_mark_dirty(rd, op->frame);
+ gnttab_post_dirty(rd, op->frame);
unmap_out:
op->status = rc;
@@ -1255,7 +1264,7 @@
}
else
{
- gnttab_mark_dirty(rd, r_frame);
+ gnttab_post_dirty(rd, r_frame);
act->pin -= GNTPIN_hstw_inc;
if ( !(act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) )
@@ -1444,6 +1453,8 @@
goto error_out;
}
+ gnttab_pre_dirty(dd, d_frame);
+
sp = map_domain_page(s_frame);
dp = map_domain_page(d_frame);
@@ -1452,7 +1463,7 @@
unmap_domain_page(dp);
unmap_domain_page(sp);
- gnttab_mark_dirty(dd, d_frame);
+ gnttab_post_dirty(dd, d_frame);
put_page_and_type(mfn_to_page(d_frame));
error_out:
diff -r 0477f9061c8a xen/include/asm-x86/cow.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/cow.h Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,83 @@
+/******************************************************************************
+ * include/asm-x86/cow.h
+ *
+ * Common interface for cow support.
+ *
+ * Copyright (c) 2009 University of British Columbia (Patrick Colp)
+ * Parts based on earlier work by Geoffrey Lefebvre
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#ifndef __COW_H__
+#define __COW_H__
+
+
+#include <xen/sched.h>
+
+
+/* Flag used for CoW debug */
+#define PAGING_DEBUG_COW 1
+
+
+/*
+ * CoW helper functions: test, set and clear the per-domain is_cow flag.
+ * They only touch the flag; no lock is taken.
+ */
+#define cow_enabled(_d) ((_d)->is_cow)
+#define enable_cow(_d) ((_d)->is_cow = 1)
+#define disable_cow(_d) ((_d)->is_cow = 0)
+
+/* CoW ring lock: serialises accesses to the per-domain CoW back ring. */
+#define cow_ring_lock_init(_d) \
+    spin_lock_init(&(_d)->arch.paging.cow.ring_lock)
+#define cow_ring_lock(_d)   spin_lock(&(_d)->arch.paging.cow.ring_lock)
+#define cow_ring_unlock(_d) spin_unlock(&(_d)->arch.paging.cow.ring_lock)
+
+
+/* Enable CoW for a domain; returns 0, or -EINVAL if already enabled */
+int cow_enable(struct domain *d);
+
+/* Disable CoW for a domain and free its CoW bitmaps */
+void cow_disable(struct domain *d);
+
+/* CoW initialisation: set up the locks and start with CoW disabled */
+void cow_init(struct domain *d);
+
+/* CoW teardown: free the CoW bitmaps */
+void cow_teardown(struct domain *d);
+
+/* Take a CoW snapshot; *when receives the ring's rsp_prod at that time */
+int cow_snapshot(struct domain *d, RING_IDX *when);
+
+/* Resume a domain paused because of CoW (buffer was full) */
+int cow_resume(struct domain *d);
+
+/* We use the mapping and unmapping of the page as a conservative boundary
+ * on the page being written to by the foreign domain */
+void cow_pre_dirty(struct domain *d, unsigned long guest_mfn);
+
+/* Copy pages out and mark them as dirty so they don't get copied again */
+void cow_mark_dirty(struct domain *d, unsigned long guest_mfn);
+
+
+#endif /* __COW_H__ */
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/asm-x86/domain.h Mon Apr 20 10:21:49 2009 -0700
@@ -6,6 +6,7 @@
#include <asm/hvm/vcpu.h>
#include <asm/hvm/domain.h>
#include <asm/e820.h>
+#include <public/io/cow.h>
#define has_32bit_shinfo(d) ((d)->arch.has_32bit_shinfo)
#define is_pv_32bit_domain(d) ((d)->arch.is_32bit_pv)
@@ -149,6 +150,41 @@
};
/************************************************/
+/* copy-on-write */
+/************************************************/
+/* Per-domain copy-on-write snapshot state. */
+struct cow_domain {
+ /* cow lock: protects the two bitmaps below */
+ spinlock_t lock;
+ int locker; /* processor that holds the lock, -1 if none */
+ const char *locker_function; /* func that took it */
+
+ /* ring lock: protects back_ring */
+ spinlock_t ring_lock;
+
+ /* size of the cow bitmaps, in bits */
+ unsigned long bitmap_size;
+
+ /* cow bitmap to record foreign pages before cow was enabled */
+ unsigned long *precow_foreign_bitmap;
+
+ /* cow bitmap to record pages that have been saved */
+ unsigned long *bitmap;
+
+ /* back-end ring for reading mfns and storing pfns */
+ cow_back_ring_t back_ring;
+
+ /* high water mark for ring: dom0 is notified once fewer slots are free */
+ RING_IDX ring_high_water;
+ /* set when the notification is scheduled, cleared by its tasklet */
+ bool_t notified_high_water;
+
+ /* paused domain: state needed to replay the operation on resume */
+ bool_t is_paused; /* domain was paused by CoW */
+ bool_t is_paused_pre_dirty; /* pause came from the pre-dirty path */
+ bool_t is_paused_scan_foreign; /* foreign-mapping scan ran out of space */
+ unsigned long paused_guest_mfn; /* frame that could not be saved */
+};
+
+/************************************************/
/* common paging data structure */
/************************************************/
struct log_dirty_domain {
@@ -181,6 +217,8 @@
struct hap_domain hap;
/* log dirty support */
struct log_dirty_domain log_dirty;
+ /* cow support */
+ struct cow_domain cow;
};
struct paging_vcpu {
diff -r 0477f9061c8a xen/include/asm-x86/grant_table.h
--- a/xen/include/asm-x86/grant_table.h Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/asm-x86/grant_table.h Mon Apr 20 10:21:49 2009 -0700
@@ -31,7 +31,8 @@
#define gnttab_shared_gmfn(d, t, i) \
(mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i)))
-#define gnttab_mark_dirty(d, f) paging_mark_dirty((d), (f))
+#define gnttab_pre_dirty(d, f) paging_pre_dirty((d), (f))
+#define gnttab_post_dirty(d, f) paging_mark_dirty((d), (f))
static inline void gnttab_clear_flag(unsigned long nr, uint16_t *addr)
{
diff -r 0477f9061c8a xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/asm-x86/paging.h Mon Apr 20 10:21:49 2009 -0700
@@ -140,7 +140,7 @@
void paging_free_log_dirty_bitmap(struct domain *d);
/* enable log dirty */
-int paging_log_dirty_enable(struct domain *d);
+int paging_log_dirty_enable(struct domain *d, bool_t enable_cow);
/* disable log dirty */
int paging_log_dirty_disable(struct domain *d);
@@ -152,6 +152,7 @@
void (*clean_dirty_bitmap)(struct domain *d));
/* mark a page as dirty */
+void paging_pre_dirty(struct domain *d, unsigned long guest_mfn);
void paging_mark_dirty(struct domain *d, unsigned long guest_mfn);
/*
diff -r 0477f9061c8a xen/include/public/domctl.h
--- a/xen/include/public/domctl.h Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/public/domctl.h Mon Apr 20 10:21:49 2009 -0700
@@ -33,6 +33,7 @@
#endif
#include "xen.h"
+#include "io/ring.h"
#define XEN_DOMCTL_INTERFACE_VERSION 0x00000005
@@ -645,6 +646,41 @@
} xen_domctl_hvmcontext_partial_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t);
+/* FIXME: use types instead of different domctls */
+/*
+ * Copy-on-write snapshot control for a domain.
+ *
+ * cow_resume and cow_disable take no argument beyond the target domain.
+ */
+#define XEN_DOMCTL_cow_enable 56
+#define XEN_DOMCTL_cow_snapshot 57
+#define XEN_DOMCTL_cow_resume 58
+#define XEN_DOMCTL_cow_disable 59
+#define XEN_DOMCTL_cow_page_type 60
+
+/* NOTE(review): the structures below use unsigned long, whose width depends
+ * on the ABI of the caller -- confirm this is acceptable for a public
+ * interface. */
+struct xen_domctl_cow_enable {
+ /* IN: mfn of the ring buffer */
+ unsigned long mfn;
+};
+typedef struct xen_domctl_cow_enable xen_domctl_cow_enable_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cow_enable_t);
+
+struct xen_domctl_cow_snapshot {
+ /* OUT: when the snapshot took place (rsp_prod) */
+ RING_IDX when;
+};
+typedef struct xen_domctl_cow_snapshot xen_domctl_cow_snapshot_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cow_snapshot_t);
+
+/* Query the page_info words (count_info, type_info) of a machine frame. */
+struct xen_domctl_cow_page_type {
+ /* IN: mfn of the page */
+ unsigned long mfn;
+ /* OUT: count info */
+ unsigned long count_info;
+ /* OUT: page type info */
+ unsigned long type_info;
+};
+typedef struct xen_domctl_cow_page_type xen_domctl_cow_page_type_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cow_page_type_t);
+
struct xen_domctl {
uint32_t cmd;
@@ -687,6 +723,9 @@
struct xen_domctl_set_target set_target;
struct xen_domctl_subscribe subscribe;
struct xen_domctl_debug_op debug_op;
+ struct xen_domctl_cow_enable cow_enable;
+ struct xen_domctl_cow_snapshot cow_snapshot;
+ struct xen_domctl_cow_page_type cow_page_type;
#if defined(__i386__) || defined(__x86_64__)
struct xen_domctl_cpuid cpuid;
#endif
diff -r 0477f9061c8a xen/include/public/io/cow.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/io/cow.h Mon Apr 20 10:21:49 2009 -0700
@@ -0,0 +1,82 @@
+/*****************************************************************************
+ * cow.h
+ *
+ * CoW common structures
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (C) 2009 University of British Columbia (Patrick Colp)
+ */
+
+#ifndef _XEN_PUBLIC_IO_COW_H
+#define _XEN_PUBLIC_IO_COW_H
+
+
+#include "ring.h"
+
+
+#define RING_MASK(_r, _i) ((_i) & (RING_SIZE(_r) - 1)) /* wraps index _i into ring _r; correct only if RING_SIZE(_r) is a power of two */
+
+
+#define XEN_COW_IOC_MAGIC 'w' /* magic character used to build the XenCow ioctl numbers */
+#define XEN_COW_IOCTL_INIT _IO(XEN_COW_IOC_MAGIC, 1) /* NOTE(review): _IO is not defined in this header; presumably the includer provides it -- confirm */
+
+#define XEN_COW_RING_PAGES 1 /* TODO: 2+ pages? */
+#define XEN_COW_RING_SIZE (XEN_COW_RING_PAGES << PAGE_SHIFT) /* ring size in bytes; PAGE_SHIFT is not defined in this header -- confirm the includer provides it */
+
+#define XEN_COW_RING_THRESHOLD 16
+
+
+/* Request and response entry types carried on the XenCow ring buffer. */
+typedef struct cow_request_st {
+ unsigned long mfn; /* frame number carried by a request (an MFN, going by the field name) */
+} cow_request_t;
+
+typedef struct cow_response_st {
+ unsigned long pfn; /* frame number carried by a response (a PFN, going by the field name) */
+} cow_response_t;
+
+
+DEFINE_RING_TYPES(cow, cow_request_t, cow_response_t); /* ring.h macro; presumably declares the shared/front/back ring types for the "cow" tag */
+
+
+/*
+ * The structure used to initialise CoW: a fixed header followed by a variable-length list of MFNs.
+ */
+typedef struct cow_init_st {
+ /* Start address of buffer */
+ unsigned long addr;
+ /* Number of frames in buffer (presumably the length of mfns[]). NOTE(review): signed int -- confirm consumers reject negative values. */
+ int num_mfns;
+ /* MFNs of buffer frames; flexible array member, so its storage lies past sizeof(cow_init_t) */
+ unsigned long mfns[];
+} cow_init_t;
+
+
+#endif /* _XEN_PUBLIC_IO_COW_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0477f9061c8a xen/include/public/xen.h
--- a/xen/include/public/xen.h Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/public/xen.h Mon Apr 20 10:21:49 2009 -0700
@@ -143,6 +143,8 @@
#define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */
#define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */
#define VIRQ_CON_RING 8 /* G. (DOM0) Bytes received on console */
+#define VIRQ_COW_BUFFER 9 /* G. (DOM0) CoW buffer has pages available */
+#define VIRQ_COW_PAUSE 10 /* G. (DOM0) CoW domain has been paused */
/* Architecture-specific VIRQ definitions. */
#define VIRQ_ARCH_0 16
diff -r 0477f9061c8a xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Fri Mar 20 17:42:46 2009 +0000
+++ b/xen/include/xen/sched.h Mon Apr 20 10:21:49 2009 -0700
@@ -223,6 +223,8 @@
bool_t is_paused_by_controller;
/* Domain's VCPUs are pinned 1:1 to physical CPUs? */
bool_t is_pinned;
+ /* Is this guest doing CoW? */
+ bool_t is_cow;
/* Are any VCPUs polling event channels (SCHEDOP_poll)? */
DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS);
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|