# HG changeset patch
# User awilliam@xxxxxxxxxxx
# Node ID 684fdcfb251a443fa885c142b427d253ec033212
# Parent 896fcdd49c7ff59f7d28b6402fd4453e60c38232
# Parent f790546ecfda03193a4b8983f7bb6b0f65924603
merge with xen-unstable.hg
---
xen/arch/x86/shadow2-common.c | 3407 ---------------
xen/arch/x86/shadow2.c | 4492 ---------------------
xen/include/asm-x86/page-guest32.h | 105
xen/include/asm-x86/shadow2-multi.h | 116
xen/include/asm-x86/shadow2-private.h | 593 --
xen/include/asm-x86/shadow2-types.h | 692 ---
xen/include/asm-x86/shadow2.h | 626 --
docs/man/xend-config.sxp.pod.5 | 2
docs/misc/xend.tex | 4
docs/src/user.tex | 4
tools/Makefile | 1
tools/console/daemon/io.c | 18
tools/examples/vif-route | 6
tools/examples/xen-hotplug-common.sh | 2
tools/examples/xen-network-common.sh | 40
tools/examples/xend-config.sxp | 2
tools/firmware/hvmloader/smbios.c | 12
tools/firmware/hvmloader/util.c | 54
tools/firmware/hvmloader/util.h | 10
tools/ioemu/Makefile | 2
tools/ioemu/patches/qemu-logging | 1
tools/ioemu/patches/xen-build | 14
tools/ioemu/vl.c | 2
tools/libxc/xc_hvm_build.c | 2
tools/misc/xend | 2
tools/python/xen/util/bugtool.py | 4
tools/python/xen/xend/XendRoot.py | 2
tools/python/xen/xend/server/params.py | 4
tools/security/python/xensec_gen/main.py | 2
unmodified_drivers/linux-2.6/platform-pci/evtchn.c | 2
xen/arch/x86/Makefile | 21
xen/arch/x86/domain.c | 46
xen/arch/x86/domain_build.c | 8
xen/arch/x86/domctl.c | 2
xen/arch/x86/hvm/hvm.c | 6
xen/arch/x86/hvm/platform.c | 4
xen/arch/x86/hvm/svm/svm.c | 183
xen/arch/x86/hvm/svm/vmcb.c | 2
xen/arch/x86/hvm/vmx/vmcs.c | 4
xen/arch/x86/hvm/vmx/vmx.c | 20
xen/arch/x86/mm.c | 142
xen/arch/x86/mm/Makefile | 1
xen/arch/x86/mm/shadow/Makefile | 15
xen/arch/x86/mm/shadow/common.c | 3407 +++++++++++++++
xen/arch/x86/mm/shadow/multi.c | 4492 +++++++++++++++++++++
xen/arch/x86/mm/shadow/multi.h | 116
xen/arch/x86/mm/shadow/page-guest32.h | 105
xen/arch/x86/mm/shadow/private.h | 593 ++
xen/arch/x86/mm/shadow/types.h | 692 +++
xen/arch/x86/traps.c | 8
xen/include/asm-x86/domain.h | 18
xen/include/asm-x86/hvm/svm/vmcb.h | 45
xen/include/asm-x86/mm.h | 82
xen/include/asm-x86/perfc_defn.h | 102
xen/include/asm-x86/shadow.h | 614 ++
55 files changed, 10488 insertions(+), 10463 deletions(-)
diff -r 896fcdd49c7f -r 684fdcfb251a docs/man/xend-config.sxp.pod.5
--- a/docs/man/xend-config.sxp.pod.5 Mon Aug 28 16:16:07 2006 -0600
+++ b/docs/man/xend-config.sxp.pod.5 Mon Aug 28 16:26:37 2006 -0600
@@ -23,7 +23,7 @@ The following lists the daemon configura
=item I<logfile>
The location of the file to record runtime log messages. Defaults to
-I</var/log/xend.log>.
+I</var/log/xen/xend.log>.
=item I<loglevel>
diff -r 896fcdd49c7f -r 684fdcfb251a docs/misc/xend.tex
--- a/docs/misc/xend.tex Mon Aug 28 16:16:07 2006 -0600
+++ b/docs/misc/xend.tex Mon Aug 28 16:26:37 2006 -0600
@@ -214,7 +214,7 @@ Configuration scripts ({\it e.g.} for ne
Configuration scripts ({\it e.g.} for network-script) are looked for in {\tt
/etc/xen}
unless their name begins with '/'.
-Xend sends its log output to {\tt /var/log/xend.log}. This is a rotating logfile,
+Xend sends its log output to {\tt /var/log/xen/xend.log}. This is a rotating logfile,
and logs are moved onto {\tt xend.log.1} {\it etc.} as they get large. Old
logs may
be deleted.
@@ -411,7 +411,7 @@ allows access to some debugging function
\end{itemize}
When tracing is on xend logs all functions calls and exceptions to
-{\tt /var/log/xend.trace}.
+{\tt /var/log/xen/xend.trace}.
\begin{thebibliography}{99}
diff -r 896fcdd49c7f -r 684fdcfb251a docs/src/user.tex
--- a/docs/src/user.tex Mon Aug 28 16:16:07 2006 -0600
+++ b/docs/src/user.tex Mon Aug 28 16:26:37 2006 -0600
@@ -973,8 +973,8 @@ using the \texttt{xm} tool.
\subsection{Logging}
-As \xend\ runs, events will be logged to \path{/var/log/xend.log} and
-(less frequently) to \path{/var/log/xend-debug.log}. These, along with
+As \xend\ runs, events will be logged to \path{/var/log/xen/xend.log} and
+(less frequently) to \path{/var/log/xen/xend-debug.log}. These, along with
the standard syslog files, are useful when troubleshooting problems.
\subsection{Configuring \Xend\ }
diff -r 896fcdd49c7f -r 684fdcfb251a tools/Makefile
--- a/tools/Makefile Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/Makefile Mon Aug 28 16:26:37 2006 -0600
@@ -39,6 +39,7 @@ install: check
done
$(MAKE) ioemuinstall
$(INSTALL_DIR) -p $(DESTDIR)/var/xen/dump
+ $(INSTALL_DIR) -p $(DESTDIR)/var/log/xen
.PHONY: clean
clean: check_clean
diff -r 896fcdd49c7f -r 684fdcfb251a tools/console/daemon/io.c
--- a/tools/console/daemon/io.c Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/console/daemon/io.c Mon Aug 28 16:26:37 2006 -0600
@@ -584,16 +584,14 @@ void handle_io(void)
FD_ISSET(xc_evtchn_fd(d->xce_handle), &readfds))
handle_ring_read(d);
- if (d->tty_fd != -1) {
- if (FD_ISSET(d->tty_fd, &readfds))
- handle_tty_read(d);
-
- if (FD_ISSET(d->tty_fd, &writefds))
- handle_tty_write(d);
-
- if (d->is_dead)
- cleanup_domain(d);
- }
+ if (d->tty_fd != -1 && FD_ISSET(d->tty_fd, &readfds))
+ handle_tty_read(d);
+
+ if (d->tty_fd != -1 && FD_ISSET(d->tty_fd, &writefds))
+ handle_tty_write(d);
+
+ if (d->is_dead)
+ cleanup_domain(d);
}
} while (ret > -1);
}
diff -r 896fcdd49c7f -r 684fdcfb251a tools/examples/vif-route
--- a/tools/examples/vif-route Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/examples/vif-route Mon Aug 28 16:26:37 2006 -0600
@@ -30,10 +30,12 @@ case "$command" in
ifconfig ${vif} ${main_ip} netmask 255.255.255.255 up
echo 1 >/proc/sys/net/ipv4/conf/${vif}/proxy_arp
ipcmd='add'
+ cmdprefix=''
;;
offline)
- ifdown ${vif}
+ do_without_error ifdown ${vif}
ipcmd='del'
+ cmdprefix='do_without_error'
;;
esac
@@ -41,7 +43,7 @@ if [ "${ip}" ] ; then
# If we've been given a list of IP addresses, then add routes from dom0 to
# the guest using those addresses.
for addr in ${ip} ; do
- ip route ${ipcmd} ${addr} dev ${vif} src ${main_ip}
+ ${cmdprefix} ip route ${ipcmd} ${addr} dev ${vif} src ${main_ip}
done
fi
diff -r 896fcdd49c7f -r 684fdcfb251a tools/examples/xen-hotplug-common.sh
--- a/tools/examples/xen-hotplug-common.sh Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/examples/xen-hotplug-common.sh Mon Aug 28 16:26:37 2006 -0600
@@ -21,7 +21,7 @@ dir=$(dirname "$0")
. "$dir/xen-script-common.sh"
. "$dir/locking.sh"
-exec 2>>/var/log/xen-hotplug.log
+exec 2>>/var/log/xen/xen-hotplug.log
export PATH="/sbin:/bin:/usr/bin:/usr/sbin:$PATH"
export LANG="POSIX"
diff -r 896fcdd49c7f -r 684fdcfb251a tools/examples/xen-network-common.sh
--- a/tools/examples/xen-network-common.sh Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/examples/xen-network-common.sh Mon Aug 28 16:26:37 2006 -0600
@@ -44,34 +44,18 @@ then
}
elif ! which ifup >/dev/null 2>/dev/null
then
- if [ -e /etc/conf.d/net ]
- then
- preiftransfer()
- {
- true
- }
- ifup()
- {
- /etc/init.d/net.$1 start
- }
- ifdown()
- {
- /etc/init.d/net.$1 stop
- }
- else
- preiftransfer()
- {
- true
- }
- ifup()
- {
- false
- }
- ifdown()
- {
- false
- }
- fi
+ preiftransfer()
+ {
+ true
+ }
+ ifup()
+ {
+ false
+ }
+ ifdown()
+ {
+ false
+ }
else
preiftransfer()
{
diff -r 896fcdd49c7f -r 684fdcfb251a tools/examples/xend-config.sxp
--- a/tools/examples/xend-config.sxp Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/examples/xend-config.sxp Mon Aug 28 16:26:37 2006 -0600
@@ -11,7 +11,7 @@
# Commented out entries show the default for that entry, unless otherwise
# specified.
-#(logfile /var/log/xend.log)
+#(logfile /var/log/xen/xend.log)
#(loglevel DEBUG)
#(xend-http-server no)
diff -r 896fcdd49c7f -r 684fdcfb251a tools/firmware/hvmloader/smbios.c
--- a/tools/firmware/hvmloader/smbios.c Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/firmware/hvmloader/smbios.c Mon Aug 28 16:26:37 2006 -0600
@@ -116,8 +116,10 @@ smbios_table_size(uint32_t vcpus, const
/* type 0: "Xen", xen_version, and release_date */
size += strlen("Xen") + strlen(xen_version) + 2;
- /* type 1: "Xen", xen_version, "HVM domU" */
- size += strlen("Xen") + strlen("HVM domU") + strlen(xen_version) + 3;
+ /* type 1: "Xen", xen_version, "HVM domU", UUID as string for
+ serial number */
+ size += strlen("Xen") + strlen("HVM domU") + strlen(xen_version) +
+ 36 + 4;
/* type 3: "Xen" */
size += strlen("Xen") + 1;
/* type 4: socket designation ("CPU n"), processor_manufacturer */
@@ -371,6 +373,7 @@ smbios_type_1_init(void *start, const ch
smbios_type_1_init(void *start, const char *xen_version,
uint8_t uuid[16])
{
+ char uuid_str[37];
struct smbios_type_1 *p = (struct smbios_type_1 *)start;
p->header.type = 1;
p->header.length = sizeof(struct smbios_type_1);
@@ -379,7 +382,7 @@ smbios_type_1_init(void *start, const ch
p->manufacturer_str = 1;
p->product_name_str = 2;
p->version_str = 3;
- p->serial_number_str = 0;
+ p->serial_number_str = 4;
memcpy(p->uuid, uuid, 16);
@@ -395,6 +398,9 @@ smbios_type_1_init(void *start, const ch
start += strlen("HVM domU") + 1;
strcpy((char *)start, xen_version);
start += strlen(xen_version) + 1;
+ uuid_to_string(uuid_str, uuid);
+ strcpy((char *)start, uuid_str);
+ start += strlen(uuid_str) + 1;
*((uint8_t *)start) = 0;
return start+1;
diff -r 896fcdd49c7f -r 684fdcfb251a tools/firmware/hvmloader/util.c
--- a/tools/firmware/hvmloader/util.c Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/firmware/hvmloader/util.c Mon Aug 28 16:26:37 2006 -0600
@@ -174,3 +174,57 @@ cpuid(uint32_t idx, uint32_t *eax, uint3
: "0" (idx) );
}
+/* Write a two-character hex representation of 'byte' to digits[].
+ Pre-condition: sizeof(digits) >= 2 */
+void
+byte_to_hex(char *digits, uint8_t byte)
+{
+ uint8_t nybbel = byte >> 4;
+
+ if (nybbel > 9)
+ digits[0] = 'a' + nybbel-10;
+ else
+ digits[0] = '0' + nybbel;
+
+ nybbel = byte & 0x0f;
+ if (nybbel > 9)
+ digits[1] = 'a' + nybbel-10;
+ else
+ digits[1] = '0' + nybbel;
+}
+
+/* Convert an array of 16 unsigned bytes to a DCE/OSF formatted UUID
+ string.
+
+ Pre-condition: sizeof(dest) >= 37 */
+void
+uuid_to_string(char *dest, uint8_t *uuid)
+{
+ int i = 0;
+ char *p = dest;
+
+ for (i = 0; i < 4; ++i) {
+ byte_to_hex(p, uuid[i]);
+ p += 2;
+ }
+ *p++ = '-';
+ for (i = 4; i < 6; ++i) {
+ byte_to_hex(p, uuid[i]);
+ p += 2;
+ }
+ *p++ = '-';
+ for (i = 6; i < 8; ++i) {
+ byte_to_hex(p, uuid[i]);
+ p += 2;
+ }
+ *p++ = '-';
+ for (i = 8; i < 10; ++i) {
+ byte_to_hex(p, uuid[i]);
+ p += 2;
+ }
+ *p++ = '-';
+ for (i = 10; i < 16; ++i) {
+ byte_to_hex(p, uuid[i]);
+ p += 2;
+ }
+}
diff -r 896fcdd49c7f -r 684fdcfb251a tools/firmware/hvmloader/util.h
--- a/tools/firmware/hvmloader/util.h Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/firmware/hvmloader/util.h Mon Aug 28 16:26:37 2006 -0600
@@ -25,6 +25,16 @@ void *memset(void *s, int c, unsigned n)
void *memset(void *s, int c, unsigned n);
char *itoa(char *a, unsigned int i);
+/* convert a byte to two lowercase hex digits, with no terminating NUL
+ character. digits[] must have at least two elements. */
+void byte_to_hex(char *digits, uint8_t byte);
+
+/* Convert an array of 16 unsigned bytes to a DCE/OSF formatted UUID
+ string.
+
+ Pre-condition: sizeof(dest) >= 37 */
+void uuid_to_string(char *dest, uint8_t *uuid);
+
/* Debug output */
void puts(const char *s);
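For illustration only (not part of this changeset): a minimal, self-contained C sketch that prints a 16-byte UUID in the same lowercase DCE/OSF 8-4-4-4-12 layout that the byte_to_hex()/uuid_to_string() helpers added above produce. The uuid[] value and the print_uuid() name are invented for the example.

/* Hypothetical host-side sketch, not part of the patch: shows the output
 * format of the new uuid_to_string() helper (36 characters, no NUL). */
#include <stdint.h>
#include <stdio.h>

static void print_uuid(const uint8_t u[16])
{
    /* 8-4-4-4-12 groups of lowercase hex digits */
    printf("%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-"
           "%02x%02x%02x%02x%02x%02x\n",
           u[0], u[1], u[2], u[3], u[4], u[5], u[6], u[7],
           u[8], u[9], u[10], u[11], u[12], u[13], u[14], u[15]);
}

int main(void)
{
    uint8_t uuid[16] = { 0xde, 0xad, 0xbe, 0xef, 0x00, 0x11, 0x22, 0x33,
                         0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb };
    print_uuid(uuid);   /* prints deadbeef-0011-2233-4455-66778899aabb */
    return 0;
}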
diff -r 896fcdd49c7f -r 684fdcfb251a tools/ioemu/Makefile
--- a/tools/ioemu/Makefile Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/ioemu/Makefile Mon Aug 28 16:26:37 2006 -0600
@@ -94,7 +94,7 @@ test speed test2: all
$(MAKE) -C tests $@
TAGS:
- etags *.[ch] tests/*.[ch]
+ etags *.[ch] target-i386-dm/*.[ch] hw/*.[ch]
cscope:
rm -f ./cscope.*
diff -r 896fcdd49c7f -r 684fdcfb251a tools/ioemu/patches/qemu-logging
--- a/tools/ioemu/patches/qemu-logging Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/ioemu/patches/qemu-logging Mon Aug 28 16:26:37 2006 -0600
@@ -43,7 +43,7 @@ Index: ioemu/vl.c
/* default mac address of the first network interface */
+ /* init debug */
-+ sprintf(qemu_dm_logfilename, "/var/log/qemu-dm.%d.log", getpid());
++ sprintf(qemu_dm_logfilename, "/var/log/xen/qemu-dm.%d.log", getpid());
+ cpu_set_log_filename(qemu_dm_logfilename);
+ cpu_set_log(0);
+
diff -r 896fcdd49c7f -r 684fdcfb251a tools/ioemu/patches/xen-build
--- a/tools/ioemu/patches/xen-build Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/ioemu/patches/xen-build Mon Aug 28 16:26:37 2006 -0600
@@ -1,7 +1,7 @@ Index: ioemu/Makefile
Index: ioemu/Makefile
===================================================================
---- ioemu.orig/Makefile 2006-08-06 02:03:44.915543858 +0100
-+++ ioemu/Makefile 2006-08-06 02:11:33.461331417 +0100
+--- ioemu.orig/Makefile 2006-08-28 20:19:23.000000000 +0100
++++ ioemu/Makefile 2006-08-28 20:20:08.000000000 +0100
@@ -1,11 +1,14 @@
# Makefile for QEMU.
@@ -60,6 +60,15 @@ Index: ioemu/Makefile
ifndef CONFIG_WIN32
mkdir -p "$(DESTDIR)$(datadir)/keymaps"
for x in $(KEYMAPS); do \
+@@ -89,7 +94,7 @@
+ $(MAKE) -C tests $@
+
+ TAGS:
+- etags *.[ch] tests/*.[ch]
++ etags *.[ch] target-i386-dm/*.[ch] hw/*.[ch]
+
+ cscope:
+ rm -f ./cscope.*
@@ -107,11 +112,11 @@
texi2dvi $<
@@ -76,8 +85,8 @@ Index: ioemu/Makefile
info: qemu-doc.info qemu-tech.info
Index: ioemu/Makefile.target
===================================================================
---- ioemu.orig/Makefile.target 2006-08-06 02:03:44.922543079 +0100
-+++ ioemu/Makefile.target 2006-08-06 02:09:22.320951557 +0100
+--- ioemu.orig/Makefile.target 2006-08-28 20:19:23.000000000 +0100
++++ ioemu/Makefile.target 2006-08-28 20:19:47.000000000 +0100
@@ -1,5 +1,8 @@
include config.mak
@@ -149,8 +158,8 @@ Index: ioemu/Makefile.target
include .depend
Index: ioemu/configure
===================================================================
---- ioemu.orig/configure 2006-08-06 02:03:45.783447220 +0100
-+++ ioemu/configure 2006-08-06 02:09:41.076860544 +0100
+--- ioemu.orig/configure 2006-08-28 20:19:23.000000000 +0100
++++ ioemu/configure 2006-08-28 20:19:47.000000000 +0100
@@ -18,8 +18,8 @@
# default parameters
diff -r 896fcdd49c7f -r 684fdcfb251a tools/ioemu/vl.c
--- a/tools/ioemu/vl.c Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/ioemu/vl.c Mon Aug 28 16:26:37 2006 -0600
@@ -5924,7 +5924,7 @@ int main(int argc, char **argv)
/* default mac address of the first network interface */
/* init debug */
- sprintf(qemu_dm_logfilename, "/var/log/qemu-dm.%d.log", getpid());
+ sprintf(qemu_dm_logfilename, "/var/log/xen/qemu-dm.%d.log", getpid());
cpu_set_log_filename(qemu_dm_logfilename);
cpu_set_log(0);
diff -r 896fcdd49c7f -r 684fdcfb251a tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/libxc/xc_hvm_build.c Mon Aug 28 16:26:37 2006 -0600
@@ -441,7 +441,7 @@ static int xc_hvm_build_internal(int xc_
goto error_out;
}
- /* HVM domains must be put into shadow2 mode at the start of day */
+ /* HVM domains must be put into shadow mode at the start of day */
if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_ENABLE,
NULL, 0, NULL,
XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT |
diff -r 896fcdd49c7f -r 684fdcfb251a tools/misc/xend
--- a/tools/misc/xend Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/misc/xend Mon Aug 28 16:26:37 2006 -0600
@@ -86,7 +86,7 @@ def start_xenstored():
XENSTORED_TRACE = os.getenv("XENSTORED_TRACE")
cmd = "xenstored --pid-file /var/run/xenstore.pid"
if XENSTORED_TRACE:
- cmd += " -T /var/log/xenstored-trace.log"
+ cmd += " -T /var/log/xen/xenstored-trace.log"
s,o = commands.getstatusoutput(cmd)
def start_consoled():
diff -r 896fcdd49c7f -r 684fdcfb251a tools/python/xen/util/bugtool.py
--- a/tools/python/xen/util/bugtool.py Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/python/xen/util/bugtool.py Mon Aug 28 16:26:37 2006 -0600
@@ -43,8 +43,8 @@ TITLE_RE = re.compile(r'<title>(.*)</tit
FILES_TO_SEND = [ '/var/log/' + x for x in
[ 'syslog', 'messages', 'debug',
- 'xend.log', 'xend-debug.log', 'xenstored-trace.log',
- 'xen-hotplug.log' ] ]
+ 'xen/xend.log', 'xen/xend-debug.log', 'xen/xenstored-trace.log',
+ 'xen/xen-hotplug.log' ] ]
#FILES_TO_SEND = [ ]
diff -r 896fcdd49c7f -r 684fdcfb251a tools/python/xen/xend/XendRoot.py
--- a/tools/python/xen/xend/XendRoot.py Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/python/xen/xend/XendRoot.py Mon Aug 28 16:26:37 2006 -0600
@@ -52,7 +52,7 @@ class XendRoot:
block_script_dir = "/etc/xen/scripts"
"""Default path to the log file. """
- logfile_default = "/var/log/xend.log"
+ logfile_default = "/var/log/xen/xend.log"
"""Default level of information to be logged."""
loglevel_default = 'DEBUG'
diff -r 896fcdd49c7f -r 684fdcfb251a tools/python/xen/xend/server/params.py
--- a/tools/python/xen/xend/server/params.py Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/python/xen/xend/server/params.py Mon Aug 28 16:26:37 2006 -0600
@@ -39,8 +39,8 @@ def getenv(var, val, conv=None):
# The following parameters could be placed in a configuration file.
XEND_PID_FILE = '/var/run/xend.pid'
-XEND_TRACE_FILE = '/var/log/xend.trace'
-XEND_DEBUG_LOG = '/var/log/xend-debug.log'
+XEND_TRACE_FILE = '/var/log/xen/xend.trace'
+XEND_DEBUG_LOG = '/var/log/xen/xend-debug.log'
XEND_USER = 'root'
XEND_DEBUG = getenv("XEND_DEBUG", 0, conv=int)
XEND_DAEMONIZE = getenv("XEND_DAEMONIZE", not XEND_DEBUG, conv=int)
diff -r 896fcdd49c7f -r 684fdcfb251a tools/security/python/xensec_gen/main.py
--- a/tools/security/python/xensec_gen/main.py Mon Aug 28 16:16:07 2006 -0600
+++ b/tools/security/python/xensec_gen/main.py Mon Aug 28 16:26:37 2006 -0600
@@ -34,7 +34,7 @@ import CGIHTTPServer
gHttpPort = 7777
gHttpDir = '/var/lib/xensec_gen'
-gLogFile = '/var/log/xensec_gen.log'
+gLogFile = '/var/log/xen/xensec_gen.log'
gUser = 'nobody'
gGroup = 'nobody'
diff -r 896fcdd49c7f -r 684fdcfb251a unmodified_drivers/linux-2.6/platform-pci/evtchn.c
--- a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c Mon Aug 28 16:16:07 2006 -0600
+++ b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c Mon Aug 28 16:26:37 2006 -0600
@@ -4,7 +4,7 @@
* A simplified event channel for para-drivers in unmodified linux
*
* Copyright (c) 2002-2005, K A Fraser
- * Copyright (c) 2005, <xiaofeng.ling@xxxxxxxxx>
+ * Copyright (c) 2005, Intel Corporation <xiaofeng.ling@xxxxxxxxx>
*
* This file may be distributed separately from the Linux kernel, or
* incorporated into other software packages, subject to the following license:
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/Makefile Mon Aug 28 16:26:37 2006 -0600
@@ -2,6 +2,7 @@ subdir-y += cpu
subdir-y += cpu
subdir-y += genapic
subdir-y += hvm
+subdir-y += mm
subdir-y += oprofile
subdir-$(x86_32) += x86_32
@@ -41,23 +42,6 @@ obj-y += usercopy.o
obj-y += usercopy.o
obj-y += x86_emulate.o
-ifneq ($(pae),n)
-obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o
-else
-obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o
-endif
-
-obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \
- shadow2_g2_on_s3.o
-
-guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
-shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
-shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
- -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
-
-shadow2_%.o: shadow2.c $(HDRS) Makefile
- $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@
-
obj-$(crash_debug) += gdbstub.o
$(TARGET): $(TARGET)-syms boot/mkelf32
@@ -86,9 +70,6 @@ boot/mkelf32: boot/mkelf32.c
boot/mkelf32: boot/mkelf32.c
$(HOSTCC) $(HOSTCFLAGS) -o $@ $<
-shadow_guest32.o: shadow.c
-shadow_guest32pae.o: shadow.c
-
.PHONY: clean
clean::
rm -f asm-offsets.s xen.lds boot/*.o boot/*~ boot/core boot/mkelf32
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/domain.c Mon Aug 28 16:26:37 2006 -0600
@@ -200,12 +200,12 @@ int arch_domain_create(struct domain *d)
#endif /* __x86_64__ */
- shadow2_lock_init(d);
- for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ )
- INIT_LIST_HEAD(&d->arch.shadow2.freelists[i]);
- INIT_LIST_HEAD(&d->arch.shadow2.p2m_freelist);
- INIT_LIST_HEAD(&d->arch.shadow2.p2m_inuse);
- INIT_LIST_HEAD(&d->arch.shadow2.toplevel_shadows);
+ shadow_lock_init(d);
+ for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
+ INIT_LIST_HEAD(&d->arch.shadow.freelists[i]);
+ INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist);
+ INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse);
+ INIT_LIST_HEAD(&d->arch.shadow.toplevel_shadows);
if ( !is_idle_domain(d) )
{
@@ -236,7 +236,7 @@ int arch_domain_create(struct domain *d)
void arch_domain_destroy(struct domain *d)
{
- shadow2_final_teardown(d);
+ shadow_final_teardown(d);
free_xenheap_pages(
d->arch.mm_perdomain_pt,
@@ -342,10 +342,10 @@ int arch_set_info_guest(
}
}
- /* Shadow2: make sure the domain has enough shadow memory to
+ /* Shadow: make sure the domain has enough shadow memory to
* boot another vcpu */
- if ( shadow2_mode_enabled(d)
- && d->arch.shadow2.total_pages < shadow2_min_acceptable_pages(d) )
+ if ( shadow_mode_enabled(d)
+ && d->arch.shadow.total_pages < shadow_min_acceptable_pages(d) )
{
destroy_gdt(v);
return -ENOMEM;
@@ -357,8 +357,8 @@ int arch_set_info_guest(
/* Don't redo final setup */
set_bit(_VCPUF_initialised, &v->vcpu_flags);
- if ( shadow2_mode_enabled(d) )
- shadow2_update_paging_modes(v);
+ if ( shadow_mode_enabled(d) )
+ shadow_update_paging_modes(v);
update_cr3(v);
@@ -936,11 +936,11 @@ void domain_relinquish_resources(struct
for_each_vcpu ( d, v )
{
/* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
- * or sh2_update_paging_modes()) */
+ * or sh_update_paging_modes()) */
pfn = pagetable_get_pfn(v->arch.guest_table);
if ( pfn != 0 )
{
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
put_page(mfn_to_page(pfn));
else
put_page_and_type(mfn_to_page(pfn));
@@ -962,7 +962,7 @@ void domain_relinquish_resources(struct
hvm_relinquish_guest_resources(d);
/* Tear down shadow mode stuff. */
- shadow2_teardown(d);
+ shadow_teardown(d);
/*
* Relinquish GDT mappings. No need for explicit unmapping of the LDT as
@@ -981,18 +981,18 @@ void domain_relinquish_resources(struct
void arch_dump_domain_info(struct domain *d)
{
- if ( shadow2_mode_enabled(d) )
- {
- printk(" shadow2 mode: ");
- if ( d->arch.shadow2.mode & SHM2_enable )
+ if ( shadow_mode_enabled(d) )
+ {
+ printk(" shadow mode: ");
+ if ( d->arch.shadow.mode & SHM2_enable )
printk("enabled ");
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
printk("refcounts ");
- if ( shadow2_mode_log_dirty(d) )
+ if ( shadow_mode_log_dirty(d) )
printk("log_dirty ");
- if ( shadow2_mode_translate(d) )
+ if ( shadow_mode_translate(d) )
printk("translate ");
- if ( shadow2_mode_external(d) )
+ if ( shadow_mode_external(d) )
printk("external ");
printk("\n");
}
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/domain_build.c Mon Aug 28 16:26:37 2006 -0600
@@ -679,8 +679,8 @@ int construct_dom0(struct domain *d,
(void)alloc_vcpu(d, i, i);
/* Set up CR3 value for write_ptbase */
- if ( shadow2_mode_enabled(v->domain) )
- shadow2_update_paging_modes(v);
+ if ( shadow_mode_enabled(v->domain) )
+ shadow_update_paging_modes(v);
else
update_cr3(v);
@@ -791,8 +791,8 @@ int construct_dom0(struct domain *d,
new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
if ( opt_dom0_shadow )
- if ( shadow2_test_enable(d) == 0 )
- shadow2_update_paging_modes(v);
+ if ( shadow_test_enable(d) == 0 )
+ shadow_update_paging_modes(v);
if ( supervisor_mode_kernel )
{
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/domctl.c Mon Aug 28 16:26:37 2006 -0600
@@ -39,7 +39,7 @@ long arch_do_domctl(
d = find_domain_by_id(domctl->domain);
if ( d != NULL )
{
- ret = shadow2_domctl(d, &domctl->u.shadow_op, u_domctl);
+ ret = shadow_domctl(d, &domctl->u.shadow_op, u_domctl);
put_domain(d);
copy_to_guest(u_domctl, domctl, 1);
}
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/hvm/hvm.c Mon Aug 28 16:26:37 2006 -0600
@@ -384,8 +384,8 @@ int hvm_copy(void *buf, unsigned long va
if (count > size)
count = size;
- gfn = shadow2_gva_to_gfn(v, vaddr);
- mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn));
+ gfn = shadow_gva_to_gfn(v, vaddr);
+ mfn = mfn_x(sh_vcpu_gfn_to_mfn(v, gfn));
if (mfn == INVALID_MFN)
return 0;
@@ -539,7 +539,7 @@ void hvm_do_hypercall(struct cpu_user_re
return;
}
- if ( current->arch.shadow2.mode->guest_levels == 4 )
+ if ( current->arch.shadow.mode->guest_levels == 4 )
{
pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi,
pregs->rsi,
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/platform.c
--- a/xen/arch/x86/hvm/platform.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/hvm/platform.c Mon Aug 28 16:26:37 2006 -0600
@@ -721,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *
if (pvalid) {
if (hvm_paging_enabled(current))
- p->u.data = shadow2_gva_to_gpa(current, value);
+ p->u.data = shadow_gva_to_gpa(current, value);
else
p->u.pdata = (void *) value; /* guest VA == guest PA */
} else
@@ -771,7 +771,7 @@ void send_mmio_req(
if (pvalid) {
if (hvm_paging_enabled(v))
- p->u.data = shadow2_gva_to_gpa(v, value);
+ p->u.data = shadow_gva_to_gpa(v, value);
else
p->u.pdata = (void *) value; /* guest VA == guest PA */
} else
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/hvm/svm/svm.c Mon Aug 28 16:26:37 2006 -0600
@@ -29,7 +29,7 @@
#include <xen/domain_page.h>
#include <asm/current.h>
#include <asm/io.h>
-#include <asm/shadow2.h>
+#include <asm/shadow.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
@@ -402,6 +402,50 @@ static inline int long_mode_do_msr_write
}
return 1;
}
+
+
+#define loaddebug(_v,_reg) \
+ __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
+#define savedebug(_v,_reg) \
+ __asm__ __volatile__ ("mov %%db" #_reg ",%0" : : "r"
((_v)->debugreg[_reg]))
+
+
+static inline void svm_save_dr(struct vcpu *v)
+{
+ if (v->arch.hvm_vcpu.flag_dr_dirty)
+ {
+ /* clear the DR dirty flag and re-enable intercepts for DR accesses */
+ v->arch.hvm_vcpu.flag_dr_dirty = 0;
+ v->arch.hvm_svm.vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES;
+
+ savedebug(&v->arch.guest_context, 0);
+ savedebug(&v->arch.guest_context, 1);
+ savedebug(&v->arch.guest_context, 2);
+ savedebug(&v->arch.guest_context, 3);
+ }
+}
+
+
+static inline void __restore_debug_registers(struct vcpu *v)
+{
+ loaddebug(&v->arch.guest_context, 0);
+ loaddebug(&v->arch.guest_context, 1);
+ loaddebug(&v->arch.guest_context, 2);
+ loaddebug(&v->arch.guest_context, 3);
+}
+
+
+static inline void svm_restore_dr(struct vcpu *v)
+{
+ struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
+ if (!vmcb)
+ return;
+
+ if (unlikely(vmcb->dr7 & 0xFF))
+ __restore_debug_registers(v);
+}
+
static int svm_realmode(struct vcpu *v)
{
@@ -717,6 +761,7 @@ static void svm_ctxt_switch_from(struct
static void svm_ctxt_switch_from(struct vcpu *v)
{
svm_freeze_time(v);
+ svm_save_dr(v);
}
static void svm_ctxt_switch_to(struct vcpu *v)
@@ -732,6 +777,7 @@ static void svm_ctxt_switch_to(struct vc
set_segment_register(es, 0);
set_segment_register(ss, 0);
#endif
+ svm_restore_dr(v);
}
@@ -746,10 +792,10 @@ static void svm_final_setup_guest(struct
if ( v != d->vcpu[0] )
return;
- if ( !shadow2_mode_external(d) )
+ if ( !shadow_mode_external(d) )
{
DPRINTK("Can't init HVM for dom %u vcpu %u: "
- "not in shadow2 external mode\n", d->domain_id, v->vcpu_id);
+ "not in shadow external mode\n", d->domain_id, v->vcpu_id);
domain_crash(d);
}
@@ -914,7 +960,7 @@ static int svm_do_page_fault(unsigned lo
va, eip, (unsigned long)regs->error_code);
//#endif
- result = shadow2_fault(va, regs);
+ result = shadow_fault(va, regs);
if( result ) {
/* Let's make sure that the Guest TLB is flushed */
@@ -1183,55 +1229,16 @@ static inline void set_reg(unsigned int
}
-static void svm_dr_access (struct vcpu *v, unsigned int reg, unsigned int type,
- struct cpu_user_regs *regs)
-{
- unsigned long *reg_p = 0;
- unsigned int gpreg = 0;
- unsigned long eip;
- int inst_len;
- int index;
- struct vmcb_struct *vmcb;
- u8 buffer[MAX_INST_LEN];
- u8 prefix = 0;
-
- vmcb = v->arch.hvm_svm.vmcb;
-
- ASSERT(vmcb);
-
- eip = vmcb->rip;
- inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer));
- index = skip_prefix_bytes(buffer, sizeof(buffer));
-
- ASSERT(buffer[index+0] == 0x0f && (buffer[index+1] & 0xFD) == 0x21);
-
- if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
- prefix = buffer[index-1];
-
- gpreg = decode_src_reg(prefix, buffer[index + 2]);
- ASSERT(reg == decode_dest_reg(prefix, buffer[index + 2]));
-
- HVM_DBG_LOG(DBG_LEVEL_1, "svm_dr_access : eip=%lx, reg=%d, gpreg = %x",
- eip, reg, gpreg);
-
- reg_p = get_reg_p(gpreg, regs, vmcb);
-
- switch (type)
- {
- case TYPE_MOV_TO_DR:
- inst_len = __get_instruction_length(vmcb, INSTR_MOV2DR, buffer);
- v->arch.guest_context.debugreg[reg] = *reg_p;
- break;
- case TYPE_MOV_FROM_DR:
- inst_len = __get_instruction_length(vmcb, INSTR_MOVDR2, buffer);
- *reg_p = v->arch.guest_context.debugreg[reg];
- break;
- default:
- __hvm_bug(regs);
- break;
- }
- ASSERT(inst_len > 0);
- __update_guest_eip(vmcb, inst_len);
+static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
+{
+ struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
+ v->arch.hvm_vcpu.flag_dr_dirty = 1;
+
+ __restore_debug_registers(v);
+
+ /* allow the guest full access to the debug registers */
+ vmcb->dr_intercepts = 0;
}
@@ -1562,7 +1569,7 @@ static int svm_set_cr0(unsigned long val
v->arch.guest_table = pagetable_from_pfn(mfn);
if ( old_base_mfn )
put_page(mfn_to_page(old_base_mfn));
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
@@ -1588,14 +1595,14 @@ static int svm_set_cr0(unsigned long val
svm_inject_exception(v, TRAP_gp_fault, 1, 0);
return 0;
}
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
}
else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
{
/* we should take care of this kind of situation */
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
}
@@ -1706,7 +1713,7 @@ static int mov_to_cr(int gpreg, int cr,
mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
if (mfn != pagetable_get_pfn(v->arch.guest_table))
__hvm_bug(regs);
- shadow2_update_cr3(v);
+ shadow_update_cr3(v);
}
else
{
@@ -1771,7 +1778,7 @@ static int mov_to_cr(int gpreg, int cr,
v->arch.guest_table = pagetable_from_pfn(mfn);
if ( old_base_mfn )
put_page(mfn_to_page(old_base_mfn));
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
@@ -1808,7 +1815,7 @@ static int mov_to_cr(int gpreg, int cr,
if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
{
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
}
break;
}
@@ -2149,7 +2156,7 @@ void svm_handle_invlpg(const short invlp
/* Overkill, we may not this */
set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
- shadow2_invlpg(v, g_vaddr);
+ shadow_invlpg(v, g_vaddr);
}
@@ -2520,7 +2527,7 @@ void walk_shadow_and_guest_pt(unsigned l
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
unsigned long gpa;
- gpa = shadow2_gva_to_gpa(current, gva);
+ gpa = shadow_gva_to_gpa(current, gva);
printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 );
if( !svm_paging_enabled(v) || mmio_space(gpa) )
return;
@@ -2591,7 +2598,7 @@ asmlinkage void svm_vmexit_handler(struc
if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF)
{
if (svm_paging_enabled(v) &&
- !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2)))
+ !mmio_space(shadow_gva_to_gpa(current, vmcb->exitinfo2)))
{
printk("I%08ld,ExC=%s(%d),IP=%x:%llx,"
"I1=%llx,I2=%llx,INT=%llx, "
@@ -2601,7 +2608,7 @@ asmlinkage void svm_vmexit_handler(struc
(unsigned long long) vmcb->exitinfo1,
(unsigned long long) vmcb->exitinfo2,
(unsigned long long) vmcb->exitintinfo.bytes,
- (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2));
+ (unsigned long long) shadow_gva_to_gpa(current, vmcb->exitinfo2));
}
else
{
@@ -2862,53 +2869,9 @@ asmlinkage void svm_vmexit_handler(struc
case VMEXIT_CR8_WRITE:
svm_cr_access(v, 8, TYPE_MOV_TO_CR, &regs);
break;
-
- case VMEXIT_DR0_READ:
- svm_dr_access(v, 0, TYPE_MOV_FROM_DR, &regs);
- break;
-
- case VMEXIT_DR1_READ:
- svm_dr_access(v, 1, TYPE_MOV_FROM_DR, &regs);
- break;
-
- case VMEXIT_DR2_READ:
- svm_dr_access(v, 2, TYPE_MOV_FROM_DR, &regs);
- break;
-
- case VMEXIT_DR3_READ:
- svm_dr_access(v, 3, TYPE_MOV_FROM_DR, &regs);
- break;
-
- case VMEXIT_DR6_READ:
- svm_dr_access(v, 6, TYPE_MOV_FROM_DR, &regs);
- break;
-
- case VMEXIT_DR7_READ:
- svm_dr_access(v, 7, TYPE_MOV_FROM_DR, &regs);
- break;
-
- case VMEXIT_DR0_WRITE:
- svm_dr_access(v, 0, TYPE_MOV_TO_DR, &regs);
- break;
-
- case VMEXIT_DR1_WRITE:
- svm_dr_access(v, 1, TYPE_MOV_TO_DR, &regs);
- break;
-
- case VMEXIT_DR2_WRITE:
- svm_dr_access(v, 2, TYPE_MOV_TO_DR, &regs);
- break;
-
- case VMEXIT_DR3_WRITE:
- svm_dr_access(v, 3, TYPE_MOV_TO_DR, &regs);
- break;
-
- case VMEXIT_DR6_WRITE:
- svm_dr_access(v, 6, TYPE_MOV_TO_DR, &regs);
- break;
-
- case VMEXIT_DR7_WRITE:
- svm_dr_access(v, 7, TYPE_MOV_TO_DR, &regs);
+
+ case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
+ svm_dr_access(v, &regs);
break;
case VMEXIT_IOIO:
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/hvm/svm/vmcb.c Mon Aug 28 16:26:37 2006 -0600
@@ -121,7 +121,7 @@ static int construct_vmcb_controls(struc
GENERAL2_INTERCEPT_SKINIT | GENERAL2_INTERCEPT_RDTSCP;
/* read or write all debug registers 0 - 15 */
- vmcb->dr_intercepts = 0;
+ vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES;
/* RD/WR all control registers 0 - 15, but not read CR2 */
vmcb->cr_intercepts = ~(CR_INTERCEPT_CR2_READ | CR_INTERCEPT_CR2_WRITE);
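Taken together, the svm.c and vmcb.c hunks above implement lazy debug-register handling: guest DR writes are intercepted by default (DR_INTERCEPT_ALL_WRITES), the first intercepted write loads the saved DR0-DR3, clears the intercepts and sets flag_dr_dirty, and the context-switch-out path saves the registers and re-arms the write intercepts only when that flag is set. A rough, self-contained user-space sketch of that state machine follows (illustrative only; struct vcpu_sim and the function names are invented, and no real debug registers are touched):

#include <stdio.h>

struct vcpu_sim {
    int dr_dirty;              /* guest wrote a DR since the last save */
    int intercept_dr_writes;   /* 1 => guest DR writes cause a VMEXIT  */
};

/* Mirrors svm_dr_access(): the first intercepted DR write hands the
 * debug registers to the guest and marks them dirty. */
static void on_dr_write_vmexit(struct vcpu_sim *v)
{
    v->dr_dirty = 1;
    v->intercept_dr_writes = 0;   /* guest now accesses DRs directly */
    /* the real code also loads the saved DR0-DR3 into hardware here */
}

/* Mirrors svm_save_dr() in svm_ctxt_switch_from(): pay the cost of
 * saving the DRs only if the guest actually used them. */
static void on_ctxt_switch_from(struct vcpu_sim *v)
{
    if (!v->dr_dirty)
        return;
    v->dr_dirty = 0;
    v->intercept_dr_writes = 1;   /* re-arm the write intercepts */
    /* the real code saves hardware DR0-DR3 into the vcpu here */
}

int main(void)
{
    struct vcpu_sim v = { 0, 1 };   /* clean state, intercepts armed */

    on_dr_write_vmexit(&v);
    printf("after DR write : dirty=%d intercept=%d\n",
           v.dr_dirty, v.intercept_dr_writes);

    on_ctxt_switch_from(&v);
    printf("after ctxt save: dirty=%d intercept=%d\n",
           v.dr_dirty, v.intercept_dr_writes);
    return 0;
}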
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Mon Aug 28 16:26:37 2006 -0600
@@ -35,7 +35,7 @@
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/keyhandler.h>
-#include <asm/shadow2.h>
+#include <asm/shadow.h>
static int vmcs_size;
static int vmcs_order;
@@ -272,7 +272,7 @@ static void vmx_do_launch(struct vcpu *v
error |= __vmwrite(GUEST_TR_BASE, 0);
error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n",
__func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3);
__vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/hvm/vmx/vmx.c Mon Aug 28 16:26:37 2006 -0600
@@ -40,7 +40,7 @@
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/hvm/vmx/cpu.h>
-#include <asm/shadow2.h>
+#include <asm/shadow.h>
#include <public/sched.h>
#include <public/hvm/ioreq.h>
#include <asm/hvm/vpic.h>
@@ -66,10 +66,10 @@ static int vmx_initialize_guest_resource
if ( v->vcpu_id != 0 )
return 1;
- if ( !shadow2_mode_external(d) )
+ if ( !shadow_mode_external(d) )
{
DPRINTK("Can't init HVM for dom %u vcpu %u: "
- "not in shadow2 external mode\n",
+ "not in shadow external mode\n",
d->domain_id, v->vcpu_id);
domain_crash(d);
}
@@ -865,7 +865,7 @@ static int vmx_do_page_fault(unsigned lo
}
#endif
- result = shadow2_fault(va, regs);
+ result = shadow_fault(va, regs);
TRACE_VMEXIT (2,result);
#if 0
@@ -1039,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigne
* We do the safest things first, then try to update the shadow
* copying from guest
*/
- shadow2_invlpg(v, va);
+ shadow_invlpg(v, va);
}
@@ -1301,7 +1301,7 @@ vmx_world_restore(struct vcpu *v, struct
skip_cr3:
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
if (!vmx_paging_enabled(v))
HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
else
@@ -1504,7 +1504,7 @@ static int vmx_set_cr0(unsigned long val
v->arch.guest_table = pagetable_from_pfn(mfn);
if (old_base_mfn)
put_page(mfn_to_page(old_base_mfn));
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
(unsigned long) (mfn << PAGE_SHIFT));
@@ -1577,7 +1577,7 @@ static int vmx_set_cr0(unsigned long val
else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
{
__vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
}
return 1;
@@ -1662,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, str
mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
if (mfn != pagetable_get_pfn(v->arch.guest_table))
__hvm_bug(regs);
- shadow2_update_cr3(v);
+ shadow_update_cr3(v);
} else {
/*
* If different, make a shadow. Check if the PDBR is valid
@@ -1755,7 +1755,7 @@ static int mov_to_cr(int gp, int cr, str
* all TLB entries except global entries.
*/
if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
- shadow2_update_paging_modes(v);
+ shadow_update_paging_modes(v);
break;
}
default:
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/mm.c Mon Aug 28 16:26:37 2006 -0600
@@ -454,12 +454,12 @@ int map_ldt_shadow_page(unsigned int off
res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
- if ( !res && unlikely(shadow2_mode_refcounts(d)) )
- {
- shadow2_lock(d);
- shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
+ if ( !res && unlikely(shadow_mode_refcounts(d)) )
+ {
+ shadow_lock(d);
+ shadow_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
- shadow2_unlock(d);
+ shadow_unlock(d);
}
if ( unlikely(!res) )
@@ -527,7 +527,7 @@ get_linear_pagetable(
struct page_info *page;
unsigned long pfn;
- ASSERT( !shadow2_mode_refcounts(d) );
+ ASSERT( !shadow_mode_refcounts(d) );
if ( (root_get_flags(re) & _PAGE_RW) )
{
@@ -602,12 +602,12 @@ get_page_from_l1e(
d = dom_io;
}
- /* Foreign mappings into guests in shadow2 external mode don't
+ /* Foreign mappings into guests in shadow external mode don't
* contribute to writeable mapping refcounts. (This allows the
* qemu-dm helper process in dom0 to map the domain's memory without
* messing up the count of "real" writable mappings.) */
okay = (((l1e_get_flags(l1e) & _PAGE_RW) &&
- !(unlikely(shadow2_mode_external(d) && (d != current->domain))))
+ !(unlikely(shadow_mode_external(d) && (d != current->domain))))
? get_page_and_type(page, d, PGT_writable_page)
: get_page(page, d));
if ( !okay )
@@ -771,9 +771,9 @@ void put_page_from_l1e(l1_pgentry_t l1e,
}
/* Remember we didn't take a type-count of foreign writable mappings
- * to shadow2 external domains */
+ * to shadow external domains */
if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
- !(unlikely((e != d) && shadow2_mode_external(e))) )
+ !(unlikely((e != d) && shadow_mode_external(e))) )
{
put_page_and_type(page);
}
@@ -830,7 +830,7 @@ static int alloc_l1_table(struct page_in
l1_pgentry_t *pl1e;
int i;
- ASSERT(!shadow2_mode_refcounts(d));
+ ASSERT(!shadow_mode_refcounts(d));
pl1e = map_domain_page(pfn);
@@ -883,7 +883,7 @@ static int create_pae_xen_mappings(l3_pg
* a. alloc_l3_table() calls this function and this check will fail
* b. mod_l3_entry() disallows updates to slot 3 in an existing table
*
- * XXX -- this needs revisiting for shadow2_mode_refcount()==true...
+ * XXX -- this needs revisiting for shadow_mode_refcount()==true...
*/
page = l3e_get_page(l3e3);
BUG_ON(page->u.inuse.type_info & PGT_pinned);
@@ -1007,7 +1007,7 @@ static int alloc_l2_table(struct page_in
l2_pgentry_t *pl2e;
int i;
- ASSERT(!shadow2_mode_refcounts(d));
+ ASSERT(!shadow_mode_refcounts(d));
pl2e = map_domain_page(pfn);
@@ -1059,7 +1059,7 @@ static int alloc_l3_table(struct page_in
l3_pgentry_t *pl3e;
int i;
- ASSERT(!shadow2_mode_refcounts(d));
+ ASSERT(!shadow_mode_refcounts(d));
#ifdef CONFIG_X86_PAE
/*
@@ -1120,7 +1120,7 @@ static int alloc_l4_table(struct page_in
unsigned long vaddr;
int i;
- ASSERT(!shadow2_mode_refcounts(d));
+ ASSERT(!shadow_mode_refcounts(d));
for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
{
@@ -1234,8 +1234,8 @@ static inline int update_l1e(l1_pgentry_
struct vcpu *v)
{
int rv = 1;
- if ( unlikely(shadow2_mode_enabled(v->domain)) )
- shadow2_lock(v->domain);
+ if ( unlikely(shadow_mode_enabled(v->domain)) )
+ shadow_lock(v->domain);
#ifndef PTE_UPDATE_WITH_CMPXCHG
rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
#else
@@ -1266,10 +1266,10 @@ static inline int update_l1e(l1_pgentry_
}
}
#endif
- if ( unlikely(shadow2_mode_enabled(v->domain)) )
- {
- shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
- shadow2_unlock(v->domain);
+ if ( unlikely(shadow_mode_enabled(v->domain)) )
+ {
+ shadow_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
+ shadow_unlock(v->domain);
}
return rv;
}
@@ -1339,13 +1339,13 @@ static int mod_l1_entry(l1_pgentry_t *pl
#endif
#define UPDATE_ENTRY(_t,_p,_o,_n,_m) ({ \
int rv; \
- if ( unlikely(shadow2_mode_enabled(current->domain)) ) \
- shadow2_lock(current->domain); \
+ if ( unlikely(shadow_mode_enabled(current->domain)) ) \
+ shadow_lock(current->domain); \
rv = _UPDATE_ENTRY(_t, _p, _o, _n); \
- if ( unlikely(shadow2_mode_enabled(current->domain)) ) \
+ if ( unlikely(shadow_mode_enabled(current->domain)) ) \
{ \
- shadow2_validate_guest_entry(current, _mfn(_m), (_p)); \
- shadow2_unlock(current->domain); \
+ shadow_validate_guest_entry(current, _mfn(_m), (_p)); \
+ shadow_unlock(current->domain); \
} \
rv; \
})
@@ -1581,21 +1581,21 @@ void free_page_type(struct page_info *pa
*/
this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
- if ( unlikely(shadow2_mode_enabled(owner)
- && !shadow2_lock_is_acquired(owner)) )
+ if ( unlikely(shadow_mode_enabled(owner)
+ && !shadow_lock_is_acquired(owner)) )
{
/* Raw page tables are rewritten during save/restore. */
- if ( !shadow2_mode_translate(owner) )
+ if ( !shadow_mode_translate(owner) )
mark_dirty(owner, page_to_mfn(page));
- if ( shadow2_mode_refcounts(owner) )
+ if ( shadow_mode_refcounts(owner) )
return;
gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
ASSERT(VALID_M2P(gmfn));
- shadow2_lock(owner);
- shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
- shadow2_unlock(owner);
+ shadow_lock(owner);
+ shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
+ shadow_unlock(owner);
}
}
@@ -1760,7 +1760,7 @@ int get_page_type(struct page_info *page
#endif
/* Fixme: add code to propagate va_unknown to subtables. */
if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
- !shadow2_mode_refcounts(page_get_owner(page)) )
+ !shadow_mode_refcounts(page_get_owner(page)) )
return 0;
/* This table is possibly mapped at multiple locations. */
nx &= ~PGT_va_mask;
@@ -1810,7 +1810,7 @@ int new_guest_cr3(unsigned long mfn)
if ( hvm_guest(v) && !hvm_paging_enabled(v) )
domain_crash_synchronous();
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
{
okay = get_page_from_pagenr(mfn, d);
if ( unlikely(!okay) )
@@ -1858,7 +1858,7 @@ int new_guest_cr3(unsigned long mfn)
if ( likely(old_base_mfn != 0) )
{
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
put_page(mfn_to_page(old_base_mfn));
else
put_page_and_type(mfn_to_page(old_base_mfn));
@@ -2043,7 +2043,7 @@ int do_mmuext_op(
type = PGT_root_page_table;
pin_page:
- if ( shadow2_mode_refcounts(FOREIGNDOM) )
+ if ( shadow_mode_refcounts(FOREIGNDOM) )
break;
okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
@@ -2065,7 +2065,7 @@ int do_mmuext_op(
break;
case MMUEXT_UNPIN_TABLE:
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
break;
if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
@@ -2078,11 +2078,11 @@ int do_mmuext_op(
{
put_page_and_type(page);
put_page(page);
- if ( shadow2_mode_enabled(d) )
+ if ( shadow_mode_enabled(d) )
{
- shadow2_lock(d);
- shadow2_remove_all_shadows(v, _mfn(mfn));
- shadow2_unlock(d);
+ shadow_lock(d);
+ shadow_remove_all_shadows(v, _mfn(mfn));
+ shadow_unlock(d);
}
}
else
@@ -2125,8 +2125,8 @@ int do_mmuext_op(
break;
case MMUEXT_INVLPG_LOCAL:
- if ( !shadow2_mode_enabled(d)
- || shadow2_invlpg(v, op.arg1.linear_addr) != 0 )
+ if ( !shadow_mode_enabled(d)
+ || shadow_invlpg(v, op.arg1.linear_addr) != 0 )
local_flush_tlb_one(op.arg1.linear_addr);
break;
@@ -2173,7 +2173,7 @@ int do_mmuext_op(
unsigned long ptr = op.arg1.linear_addr;
unsigned long ents = op.arg2.nr_ents;
- if ( shadow2_mode_external(d) )
+ if ( shadow_mode_external(d) )
{
MEM_LOG("ignoring SET_LDT hypercall from external "
"domain %u", d->domain_id);
@@ -2319,7 +2319,7 @@ int do_mmu_update(
case PGT_l3_page_table:
case PGT_l4_page_table:
{
- if ( shadow2_mode_refcounts(d) )
+ if ( shadow_mode_refcounts(d) )
{
DPRINTK("mmu update on shadow-refcounted domain!");
break;
@@ -2372,16 +2372,16 @@ int do_mmu_update(
if ( unlikely(!get_page_type(page, PGT_writable_page)) )
break;
- if ( unlikely(shadow2_mode_enabled(d)) )
- shadow2_lock(d);
+ if ( unlikely(shadow_mode_enabled(d)) )
+ shadow_lock(d);
*(intpte_t *)va = req.val;
okay = 1;
- if ( unlikely(shadow2_mode_enabled(d)) )
+ if ( unlikely(shadow_mode_enabled(d)) )
{
- shadow2_validate_guest_entry(v, _mfn(mfn), va);
- shadow2_unlock(d);
+ shadow_validate_guest_entry(v, _mfn(mfn), va);
+ shadow_unlock(d);
}
put_page_type(page);
@@ -2405,8 +2405,8 @@ int do_mmu_update(
break;
}
- if ( shadow2_mode_translate(FOREIGNDOM) )
- shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
+ if ( shadow_mode_translate(FOREIGNDOM) )
+ shadow_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
else
set_gpfn_from_mfn(mfn, gpfn);
okay = 1;
@@ -2492,7 +2492,7 @@ static int create_grant_pte_mapping(
goto failed;
}
- if ( !shadow2_mode_refcounts(d) )
+ if ( !shadow_mode_refcounts(d) )
put_page_from_l1e(ol1e, d);
put_page_type(page);
@@ -2590,7 +2590,7 @@ static int create_grant_va_mapping(
l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
return GNTST_general_error;
- if ( !shadow2_mode_refcounts(d) )
+ if ( !shadow_mode_refcounts(d) )
put_page_from_l1e(ol1e, d);
return GNTST_okay;
@@ -2714,10 +2714,10 @@ int do_update_va_mapping(unsigned long v
perfc_incrc(calls_to_update_va);
- if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) )
+ if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
return -EINVAL;
- if ( unlikely(shadow2_mode_refcounts(d)) )
+ if ( unlikely(shadow_mode_refcounts(d)) )
{
DPRINTK("Grant op on a shadow-refcounted domain\n");
return -EINVAL;
@@ -2725,11 +2725,11 @@ int do_update_va_mapping(unsigned long v
LOCK_BIGLOCK(d);
- if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) )
+ if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
{
if ( unlikely(this_cpu(percpu_mm_info).foreign &&
- (shadow2_mode_translate(d) ||
- shadow2_mode_translate(
+ (shadow_mode_translate(d) ||
+ shadow_mode_translate(
this_cpu(percpu_mm_info).foreign))) )
{
/*
@@ -2770,8 +2770,8 @@ int do_update_va_mapping(unsigned long v
switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
{
case UVMF_LOCAL:
- if ( !shadow2_mode_enabled(d)
- || (shadow2_invlpg(current, va) != 0) )
+ if ( !shadow_mode_enabled(d)
+ || (shadow_invlpg(current, va) != 0) )
local_flush_tlb_one(va);
break;
case UVMF_ALL:
@@ -3006,7 +3006,7 @@ long arch_memory_op(int op, XEN_GUEST_HA
break;
}
- if ( !shadow2_mode_translate(d) || (mfn == 0) )
+ if ( !shadow_mode_translate(d) || (mfn == 0) )
{
put_domain(d);
return -EINVAL;
@@ -3196,21 +3196,21 @@ static int ptwr_emulated_update(
pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
if ( do_cmpxchg )
{
- if ( shadow2_mode_enabled(d) )
- shadow2_lock(d);
+ if ( shadow_mode_enabled(d) )
+ shadow_lock(d);
ol1e = l1e_from_intpte(old);
if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
{
- if ( shadow2_mode_enabled(d) )
- shadow2_unlock(d);
+ if ( shadow_mode_enabled(d) )
+ shadow_unlock(d);
unmap_domain_page(pl1e);
put_page_from_l1e(nl1e, d);
return X86EMUL_CMPXCHG_FAILED;
}
- if ( unlikely(shadow2_mode_enabled(v->domain)) )
- {
- shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
- shadow2_unlock(v->domain);
+ if ( unlikely(shadow_mode_enabled(v->domain)) )
+ {
+ shadow_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
+ shadow_unlock(v->domain);
}
}
else
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/arch/x86/traps.c Mon Aug 28 16:26:37 2006 -0600
@@ -870,8 +870,8 @@ static int fixup_page_fault(unsigned lon
if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
{
- if ( shadow2_mode_external(d) && guest_mode(regs) )
- return shadow2_fault(addr, regs);
+ if ( shadow_mode_external(d) && guest_mode(regs) )
+ return shadow_fault(addr, regs);
if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
return handle_gdt_ldt_mapping_fault(
addr - GDT_LDT_VIRT_START, regs);
@@ -890,8 +890,8 @@ static int fixup_page_fault(unsigned lon
ptwr_do_page_fault(d, addr, regs) )
return EXCRET_fault_fixed;
- if ( shadow2_mode_enabled(d) )
- return shadow2_fault(addr, regs);
+ if ( shadow_mode_enabled(d) )
+ return shadow_fault(addr, regs);
return 0;
}
diff -r 896fcdd49c7f -r 684fdcfb251a xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/include/asm-x86/domain.h Mon Aug 28 16:26:37 2006 -0600
@@ -59,10 +59,10 @@ extern void hypercall_page_initialise(st
struct shadow_domain {
u32 mode; /* flags to control shadow operation */
- spinlock_t lock; /* shadow2 domain lock */
+ spinlock_t lock; /* shadow domain lock */
int locker; /* processor which holds the lock */
const char *locker_function; /* Func that took it */
- struct list_head freelists[SHADOW2_MAX_ORDER + 1];
+ struct list_head freelists[SHADOW_MAX_ORDER + 1];
struct list_head p2m_freelist;
struct list_head p2m_inuse;
struct list_head toplevel_shadows;
@@ -70,10 +70,10 @@ struct shadow_domain {
unsigned int free_pages; /* number of pages on freelists */
unsigned int p2m_pages; /* number of pages in p2m map */
- /* Shadow2 hashtable */
- struct shadow2_hash_entry *hash_table;
- struct shadow2_hash_entry *hash_freelist;
- struct shadow2_hash_entry *hash_allocations;
+ /* Shadow hashtable */
+ struct shadow_hash_entry *hash_table;
+ struct shadow_hash_entry *hash_freelist;
+ struct shadow_hash_entry *hash_allocations;
int hash_walking; /* Some function is walking the hash table */
/* Shadow log-dirty bitmap */
@@ -107,7 +107,7 @@ struct arch_domain
/* Shadow-translated guest: Pseudophys base address of reserved area. */
unsigned long first_reserved_pfn;
- struct shadow_domain shadow2;
+ struct shadow_domain shadow;
/* Shadow translated domain: P2M mapping */
pagetable_t phys_table;
@@ -135,7 +135,7 @@ struct pae_l3_cache { };
struct shadow_vcpu {
/* Pointers to mode-specific entry points. */
- struct shadow2_paging_mode *mode;
+ struct shadow_paging_mode *mode;
/* Last MFN that we emulated a write to. */
unsigned long last_emulated_mfn;
/* HVM guest: paging enabled (CR0.PG)? */
@@ -201,7 +201,7 @@ struct arch_vcpu
/* Current LDT details. */
unsigned long shadow_ldt_mapcnt;
- struct shadow_vcpu shadow2;
+ struct shadow_vcpu shadow;
} __cacheline_aligned;
/* shorthands to improve code legibility */
diff -r 896fcdd49c7f -r 684fdcfb251a xen/include/asm-x86/hvm/svm/vmcb.h
--- a/xen/include/asm-x86/hvm/svm/vmcb.h Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h Mon Aug 28 16:26:37 2006 -0600
@@ -113,6 +113,51 @@ enum CRInterceptBits
CR_INTERCEPT_CR14_WRITE = 1 << 30,
CR_INTERCEPT_CR15_WRITE = 1 << 31,
};
+
+
+/* debug register intercepts */
+enum DRInterceptBits
+{
+ DR_INTERCEPT_DR0_READ = 1 << 0,
+ DR_INTERCEPT_DR1_READ = 1 << 1,
+ DR_INTERCEPT_DR2_READ = 1 << 2,
+ DR_INTERCEPT_DR3_READ = 1 << 3,
+ DR_INTERCEPT_DR4_READ = 1 << 4,
+ DR_INTERCEPT_DR5_READ = 1 << 5,
+ DR_INTERCEPT_DR6_READ = 1 << 6,
+ DR_INTERCEPT_DR7_READ = 1 << 7,
+ DR_INTERCEPT_DR8_READ = 1 << 8,
+ DR_INTERCEPT_DR9_READ = 1 << 9,
+ DR_INTERCEPT_DR10_READ = 1 << 10,
+ DR_INTERCEPT_DR11_READ = 1 << 11,
+ DR_INTERCEPT_DR12_READ = 1 << 12,
+ DR_INTERCEPT_DR13_READ = 1 << 13,
+ DR_INTERCEPT_DR14_READ = 1 << 14,
+ DR_INTERCEPT_DR15_READ = 1 << 15,
+ DR_INTERCEPT_DR0_WRITE = 1 << 16,
+ DR_INTERCEPT_DR1_WRITE = 1 << 17,
+ DR_INTERCEPT_DR2_WRITE = 1 << 18,
+ DR_INTERCEPT_DR3_WRITE = 1 << 19,
+ DR_INTERCEPT_DR4_WRITE = 1 << 20,
+ DR_INTERCEPT_DR5_WRITE = 1 << 21,
+ DR_INTERCEPT_DR6_WRITE = 1 << 22,
+ DR_INTERCEPT_DR7_WRITE = 1 << 23,
+ DR_INTERCEPT_DR8_WRITE = 1 << 24,
+ DR_INTERCEPT_DR9_WRITE = 1 << 25,
+ DR_INTERCEPT_DR10_WRITE = 1 << 26,
+ DR_INTERCEPT_DR11_WRITE = 1 << 27,
+ DR_INTERCEPT_DR12_WRITE = 1 << 28,
+ DR_INTERCEPT_DR13_WRITE = 1 << 29,
+ DR_INTERCEPT_DR14_WRITE = 1 << 30,
+ DR_INTERCEPT_DR15_WRITE = 1 << 31,
+};
+
+/* for lazy save/restore we'd like to intercept all DR writes */
+#define DR_INTERCEPT_ALL_WRITES \
+ (DR_INTERCEPT_DR0_WRITE|DR_INTERCEPT_DR1_WRITE|DR_INTERCEPT_DR2_WRITE \
+ |DR_INTERCEPT_DR3_WRITE|DR_INTERCEPT_DR4_WRITE|DR_INTERCEPT_DR5_WRITE \
+ |DR_INTERCEPT_DR6_WRITE|DR_INTERCEPT_DR7_WRITE)
+
enum VMEXIT_EXITCODE
{
diff -r 896fcdd49c7f -r 684fdcfb251a xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/include/asm-x86/mm.h Mon Aug 28 16:26:37 2006 -0600
@@ -22,7 +22,7 @@ struct page_info
/* Each frame can be threaded onto a doubly-linked list. */
union {
struct list_head list;
- /* Shadow2 uses this field as an up-pointer in lower-level shadows */
+ /* Shadow uses this field as an up-pointer in lower-level shadows */
paddr_t up;
};
@@ -59,7 +59,7 @@ struct page_info
/* Only used on guest pages with a shadow.
* Guest pages with a shadow must have a non-zero type count, so this
* does not conflict with the tlbflush timestamp. */
- u32 shadow2_flags;
+ u32 shadow_flags;
// XXX -- we expect to add another field here, to be used for min/max
// purposes, which is only used for shadow pages.
@@ -76,7 +76,7 @@ struct page_info
#define PGT_ldt_page (6U<<29) /* using this page in an LDT? */
#define PGT_writable_page (7U<<29) /* has writable mappings of this page? */
-#ifndef SHADOW2
+#ifndef SHADOW
#define PGT_l1_shadow PGT_l1_page_table
#define PGT_l2_shadow PGT_l2_page_table
#define PGT_l3_shadow PGT_l3_page_table
@@ -117,7 +117,7 @@ struct page_info
/* 16-bit count of uses of this frame as its current type. */
#define PGT_count_mask ((1U<<16)-1)
-#ifndef SHADOW2
+#ifndef SHADOW
#ifdef __x86_64__
#define PGT_high_mfn_shift 52
#define PGT_high_mfn_mask (0xfffUL << PGT_high_mfn_shift)
@@ -132,7 +132,7 @@ struct page_info
#define PGT_score_shift 23
#define PGT_score_mask (((1U<<4)-1)<<PGT_score_shift)
#endif
-#endif /* SHADOW2 */
+#endif /* SHADOW */
/* Cleared when the owning guest 'frees' this page. */
#define _PGC_allocated 31
@@ -146,38 +146,38 @@ struct page_info
/* 29-bit count of references to this frame. */
#define PGC_count_mask ((1U<<29)-1)
-/* shadow2 uses the count_info on shadow pages somewhat differently */
-/* NB: please coordinate any changes here with the SH2F's in shadow2.h */
-#define PGC_SH2_none (0U<<28) /* on the shadow2 free list */
-#define PGC_SH2_min_shadow (1U<<28)
-#define PGC_SH2_l1_32_shadow (1U<<28) /* shadowing a 32-bit L1 guest page */
-#define PGC_SH2_fl1_32_shadow (2U<<28) /* L1 shadow for a 32b 4M superpage */
-#define PGC_SH2_l2_32_shadow (3U<<28) /* shadowing a 32-bit L2 guest page */
-#define PGC_SH2_l1_pae_shadow (4U<<28) /* shadowing a pae L1 page */
-#define PGC_SH2_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */
-#define PGC_SH2_l2_pae_shadow (6U<<28) /* shadowing a pae L2-low page */
-#define PGC_SH2_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */
-#define PGC_SH2_l3_pae_shadow (8U<<28) /* shadowing a pae L3 page */
-#define PGC_SH2_l1_64_shadow (9U<<28) /* shadowing a 64-bit L1 page */
-#define PGC_SH2_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */
-#define PGC_SH2_l2_64_shadow (11U<<28) /* shadowing a 64-bit L2 page */
-#define PGC_SH2_l3_64_shadow (12U<<28) /* shadowing a 64-bit L3 page */
-#define PGC_SH2_l4_64_shadow (13U<<28) /* shadowing a 64-bit L4 page */
-#define PGC_SH2_max_shadow (13U<<28)
-#define PGC_SH2_p2m_table (14U<<28) /* in use as the p2m table */
-#define PGC_SH2_monitor_table (15U<<28) /* in use as a monitor table */
-#define PGC_SH2_unused (15U<<28)
-
-#define PGC_SH2_type_mask (15U<<28)
-#define PGC_SH2_type_shift 28
-
-#define PGC_SH2_pinned (1U<<27)
-
-#define _PGC_SH2_log_dirty 26
-#define PGC_SH2_log_dirty (1U<<26)
+/* shadow uses the count_info on shadow pages somewhat differently */
+/* NB: please coordinate any changes here with the SHF's in shadow.h */
+#define PGC_SH_none (0U<<28) /* on the shadow free list */
+#define PGC_SH_min_shadow (1U<<28)
+#define PGC_SH_l1_32_shadow (1U<<28) /* shadowing a 32-bit L1 guest page */
+#define PGC_SH_fl1_32_shadow (2U<<28) /* L1 shadow for a 32b 4M superpage */
+#define PGC_SH_l2_32_shadow (3U<<28) /* shadowing a 32-bit L2 guest page */
+#define PGC_SH_l1_pae_shadow (4U<<28) /* shadowing a pae L1 page */
+#define PGC_SH_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */
+#define PGC_SH_l2_pae_shadow (6U<<28) /* shadowing a pae L2-low page */
+#define PGC_SH_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */
+#define PGC_SH_l3_pae_shadow (8U<<28) /* shadowing a pae L3 page */
+#define PGC_SH_l1_64_shadow (9U<<28) /* shadowing a 64-bit L1 page */
+#define PGC_SH_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */
+#define PGC_SH_l2_64_shadow (11U<<28) /* shadowing a 64-bit L2 page */
+#define PGC_SH_l3_64_shadow (12U<<28) /* shadowing a 64-bit L3 page */
+#define PGC_SH_l4_64_shadow (13U<<28) /* shadowing a 64-bit L4 page */
+#define PGC_SH_max_shadow (13U<<28)
+#define PGC_SH_p2m_table (14U<<28) /* in use as the p2m table */
+#define PGC_SH_monitor_table (15U<<28) /* in use as a monitor table */
+#define PGC_SH_unused (15U<<28)
+
+#define PGC_SH_type_mask (15U<<28)
+#define PGC_SH_type_shift 28
+
+#define PGC_SH_pinned (1U<<27)
+
+#define _PGC_SH_log_dirty 26
+#define PGC_SH_log_dirty (1U<<26)
/* 26 bit ref count for shadow pages */
-#define PGC_SH2_count_mask ((1U<<26) - 1)
+#define PGC_SH_count_mask ((1U<<26) - 1)
/* We trust the slab allocator in slab.c, and our use of it. */
#define PageSlab(page) (1)
@@ -201,9 +201,9 @@ static inline u32 pickle_domptr(struct d
/* The order of the largest allocation unit we use for shadow pages */
#if CONFIG_PAGING_LEVELS == 2
-#define SHADOW2_MAX_ORDER 0 /* Only ever need 4k allocations */
+#define SHADOW_MAX_ORDER 0 /* Only ever need 4k allocations */
#else
-#define SHADOW2_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
+#define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
#endif
#define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain))
@@ -227,7 +227,7 @@ extern int shadow_remove_all_write_acces
extern int shadow_remove_all_write_access(
struct domain *d, unsigned long gmfn, unsigned long mfn);
extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
-extern int _shadow2_mode_refcounts(struct domain *d);
+extern int _shadow_mode_refcounts(struct domain *d);
static inline void put_page(struct page_info *page)
{
@@ -259,7 +259,7 @@ static inline int get_page(struct page_i
unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
unlikely(d != _domain) ) /* Wrong owner? */
{
- if ( !_shadow2_mode_refcounts(domain) )
+ if ( !_shadow_mode_refcounts(domain) )
DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
PRtype_info "\n",
page_to_mfn(page), domain, unpickle_domptr(d),
@@ -345,11 +345,11 @@ int check_descriptor(struct desc_struct
#define mfn_to_gmfn(_d, mfn) \
- ( (shadow2_mode_translate(_d)) \
+ ( (shadow_mode_translate(_d)) \
? get_gpfn_from_mfn(mfn) \
: (mfn) )
-#define gmfn_to_mfn(_d, gpfn) mfn_x(sh2_gfn_to_mfn(_d, gpfn))
+#define gmfn_to_mfn(_d, gpfn) mfn_x(sh_gfn_to_mfn(_d, gpfn))
/*
diff -r 896fcdd49c7f -r 684fdcfb251a xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/include/asm-x86/perfc_defn.h Mon Aug 28 16:26:37 2006 -0600
@@ -30,59 +30,59 @@ PERFCOUNTER_CPU(exception_fixed,
PERFCOUNTER_CPU(exception_fixed, "pre-exception fixed")
-/* Shadow2 counters */
-PERFCOUNTER_CPU(shadow2_alloc, "calls to shadow2_alloc")
-PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs")
+/* Shadow counters */
+PERFCOUNTER_CPU(shadow_alloc, "calls to shadow_alloc")
+PERFCOUNTER_CPU(shadow_alloc_tlbflush, "shadow_alloc flushed TLBs")
/* STATUS counters do not reset when 'P' is hit */
-PERFSTATUS(shadow2_alloc_count, "number of shadow pages in use")
-PERFCOUNTER_CPU(shadow2_free, "calls to shadow2_free")
-PERFCOUNTER_CPU(shadow2_prealloc_1, "shadow2 recycles old shadows")
-PERFCOUNTER_CPU(shadow2_prealloc_2, "shadow2 recycles in-use shadows")
-PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map")
-PERFCOUNTER_CPU(shadow2_a_update, "shadow2 A bit update")
-PERFCOUNTER_CPU(shadow2_ad_update, "shadow2 A&D bit update")
-PERFCOUNTER_CPU(shadow2_fault, "calls to shadow2_fault")
-PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn")
-PERFCOUNTER_CPU(shadow2_fault_bail_not_present,
- "shadow2_fault guest not-present")
-PERFCOUNTER_CPU(shadow2_fault_bail_nx, "shadow2_fault guest NX fault")
-PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault")
-PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor,
- "shadow2_fault guest U/S fault")
-PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read")
-PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write")
-PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails")
-PERFCOUNTER_CPU(shadow2_fault_mmio, "shadow2_fault handled as mmio")
-PERFCOUNTER_CPU(shadow2_fault_fixed, "shadow2_fault fixed fault")
-PERFCOUNTER_CPU(shadow2_ptwr_emulate, "shadow2 causes ptwr to emulate")
-PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e")
-PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e")
-PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e")
-PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e")
-PERFCOUNTER_CPU(shadow2_hash_lookups, "calls to shadow2_hash_lookup")
-PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head")
-PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses")
-PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status")
-PERFCOUNTER_CPU(shadow2_hash_inserts, "calls to shadow2_hash_insert")
-PERFCOUNTER_CPU(shadow2_hash_deletes, "calls to shadow2_hash_delete")
-PERFCOUNTER_CPU(shadow2_writeable, "shadow2 removes write access")
-PERFCOUNTER_CPU(shadow2_writeable_h_1, "shadow2 writeable: 32b w2k3")
-PERFCOUNTER_CPU(shadow2_writeable_h_2, "shadow2 writeable: 32pae w2k3")
-PERFCOUNTER_CPU(shadow2_writeable_h_3, "shadow2 writeable: 64b w2k3")
-PERFCOUNTER_CPU(shadow2_writeable_h_4, "shadow2 writeable: 32b linux low")
-PERFCOUNTER_CPU(shadow2_writeable_bf, "shadow2 writeable brute-force")
-PERFCOUNTER_CPU(shadow2_mappings, "shadow2 removes all mappings")
-PERFCOUNTER_CPU(shadow2_mappings_bf, "shadow2 rm-mappings brute-force")
-PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit")
-PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit")
-PERFCOUNTER_CPU(shadow2_unshadow, "shadow2 unshadows a page")
-PERFCOUNTER_CPU(shadow2_up_pointer, "shadow2 unshadow by up-pointer")
-PERFCOUNTER_CPU(shadow2_unshadow_bf, "shadow2 unshadow brute-force")
-PERFCOUNTER_CPU(shadow2_get_page_fail, "shadow2_get_page_from_l1e failed")
-PERFCOUNTER_CPU(shadow2_guest_walk, "shadow2 walks guest tables")
-PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits")
-PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses")
+PERFSTATUS(shadow_alloc_count, "number of shadow pages in use")
+PERFCOUNTER_CPU(shadow_free, "calls to shadow_free")
+PERFCOUNTER_CPU(shadow_prealloc_1, "shadow recycles old shadows")
+PERFCOUNTER_CPU(shadow_prealloc_2, "shadow recycles in-use shadows")
+PERFCOUNTER_CPU(shadow_linear_map_failed, "shadow hit read-only linear map")
+PERFCOUNTER_CPU(shadow_a_update, "shadow A bit update")
+PERFCOUNTER_CPU(shadow_ad_update, "shadow A&D bit update")
+PERFCOUNTER_CPU(shadow_fault, "calls to shadow_fault")
+PERFCOUNTER_CPU(shadow_fault_bail_bad_gfn, "shadow_fault guest bad gfn")
+PERFCOUNTER_CPU(shadow_fault_bail_not_present,
+ "shadow_fault guest not-present")
+PERFCOUNTER_CPU(shadow_fault_bail_nx, "shadow_fault guest NX fault")
+PERFCOUNTER_CPU(shadow_fault_bail_ro_mapping, "shadow_fault guest R/W fault")
+PERFCOUNTER_CPU(shadow_fault_bail_user_supervisor,
+ "shadow_fault guest U/S fault")
+PERFCOUNTER_CPU(shadow_fault_emulate_read, "shadow_fault emulates a read")
+PERFCOUNTER_CPU(shadow_fault_emulate_write, "shadow_fault emulates a write")
+PERFCOUNTER_CPU(shadow_fault_emulate_failed, "shadow_fault emulator fails")
+PERFCOUNTER_CPU(shadow_fault_mmio, "shadow_fault handled as mmio")
+PERFCOUNTER_CPU(shadow_fault_fixed, "shadow_fault fixed fault")
+PERFCOUNTER_CPU(shadow_ptwr_emulate, "shadow causes ptwr to emulate")
+PERFCOUNTER_CPU(shadow_validate_gl1e_calls, "calls to shadow_validate_gl1e")
+PERFCOUNTER_CPU(shadow_validate_gl2e_calls, "calls to shadow_validate_gl2e")
+PERFCOUNTER_CPU(shadow_validate_gl3e_calls, "calls to shadow_validate_gl3e")
+PERFCOUNTER_CPU(shadow_validate_gl4e_calls, "calls to shadow_validate_gl4e")
+PERFCOUNTER_CPU(shadow_hash_lookups, "calls to shadow_hash_lookup")
+PERFCOUNTER_CPU(shadow_hash_lookup_head, "shadow hash hit in bucket head")
+PERFCOUNTER_CPU(shadow_hash_lookup_miss, "shadow hash misses")
+PERFCOUNTER_CPU(shadow_get_shadow_status, "calls to get_shadow_status")
+PERFCOUNTER_CPU(shadow_hash_inserts, "calls to shadow_hash_insert")
+PERFCOUNTER_CPU(shadow_hash_deletes, "calls to shadow_hash_delete")
+PERFCOUNTER_CPU(shadow_writeable, "shadow removes write access")
+PERFCOUNTER_CPU(shadow_writeable_h_1, "shadow writeable: 32b w2k3")
+PERFCOUNTER_CPU(shadow_writeable_h_2, "shadow writeable: 32pae w2k3")
+PERFCOUNTER_CPU(shadow_writeable_h_3, "shadow writeable: 64b w2k3")
+PERFCOUNTER_CPU(shadow_writeable_h_4, "shadow writeable: 32b linux low")
+PERFCOUNTER_CPU(shadow_writeable_bf, "shadow writeable brute-force")
+PERFCOUNTER_CPU(shadow_mappings, "shadow removes all mappings")
+PERFCOUNTER_CPU(shadow_mappings_bf, "shadow rm-mappings brute-force")
+PERFCOUNTER_CPU(shadow_early_unshadow, "shadow unshadows for fork/exit")
+PERFCOUNTER_CPU(shadow_early_unshadow_top, "shadow unhooks for fork/exit")
+PERFCOUNTER_CPU(shadow_unshadow, "shadow unshadows a page")
+PERFCOUNTER_CPU(shadow_up_pointer, "shadow unshadow by up-pointer")
+PERFCOUNTER_CPU(shadow_unshadow_bf, "shadow unshadow brute-force")
+PERFCOUNTER_CPU(shadow_get_page_fail, "shadow_get_page_from_l1e failed")
+PERFCOUNTER_CPU(shadow_guest_walk, "shadow walks guest tables")
+PERFCOUNTER_CPU(shadow_walk_cache_hit, "shadow walk-cache hits")
+PERFCOUNTER_CPU(shadow_walk_cache_miss, "shadow walk-cache misses")
/*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff -r 896fcdd49c7f -r 684fdcfb251a xen/include/asm-x86/shadow.h
--- a/xen/include/asm-x86/shadow.h Mon Aug 28 16:16:07 2006 -0600
+++ b/xen/include/asm-x86/shadow.h Mon Aug 28 16:26:37 2006 -0600
@@ -1,7 +1,9 @@
/******************************************************************************
* include/asm-x86/shadow.h
*
- * Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -21,26 +23,608 @@
#ifndef _XEN_SHADOW_H
#define _XEN_SHADOW_H
-/* This file is just a wrapper around the new Shadow2 header,
- * providing names that must be defined in any shadow implementation. */
-
-#include <asm/shadow2.h>
+#include <public/domctl.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <asm/flushtlb.h>
/* How to make sure a page is not referred to in a shadow PT */
/* This will need to be a for_each_vcpu if we go to per-vcpu shadows */
#define shadow_drop_references(_d, _p) \
- shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
+ shadow_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
#define shadow_sync_and_drop_references(_d, _p) \
- shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
-
-/* Whether we are translating the domain's frame numbers for it */
-#define shadow_mode_translate(d) shadow2_mode_translate(d)
-
-/* ...and if so, how to add and remove entries in the mapping */
+ shadow_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
+
+/* How to add and remove entries in the p2m mapping. */
#define guest_physmap_add_page(_d, _p, _m) \
- shadow2_guest_physmap_add_page((_d), (_p), (_m))
+ shadow_guest_physmap_add_page((_d), (_p), (_m))
#define guest_physmap_remove_page(_d, _p, _m ) \
- shadow2_guest_physmap_remove_page((_d), (_p), (_m))
+ shadow_guest_physmap_remove_page((_d), (_p), (_m))
+
+/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
+
+#define SHM2_shift 10
+/* We're in one of the shadow modes */
+#define SHM2_enable (1U << SHM2_shift)
+/* Refcounts based on shadow tables instead of guest tables */
+#define SHM2_refcounts (XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT << SHM2_shift)
+/* Enable log dirty mode */
+#define SHM2_log_dirty (XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY << SHM2_shift)
+/* Xen does p2m translation, not guest */
+#define SHM2_translate (XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE << SHM2_shift)
+/* Xen does not steal address space from the domain for its own bookkeeping;
+ * requires VT or similar mechanisms */
+#define SHM2_external (XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL << SHM2_shift)
+
+#define shadow_mode_enabled(_d) ((_d)->arch.shadow.mode)
+#define shadow_mode_refcounts(_d) ((_d)->arch.shadow.mode & SHM2_refcounts)
+#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow.mode & SHM2_log_dirty)
+#define shadow_mode_translate(_d) ((_d)->arch.shadow.mode & SHM2_translate)
+#define shadow_mode_external(_d) ((_d)->arch.shadow.mode & SHM2_external)
+
+/* Xen traps & emulates all reads of all page table pages:
+ * not yet supported
+ */
+#define shadow_mode_trap_reads(_d) ({ (void)(_d); 0; })
+
+// flags used in the return value of the shadow_set_lXe() functions...
+#define SHADOW_SET_CHANGED 0x1
+#define SHADOW_SET_FLUSH 0x2
+#define SHADOW_SET_ERROR 0x4
+#define SHADOW_SET_L3PAE_RECOPY 0x8
+
+// How do we tell that we have a 32-bit PV guest in a 64-bit Xen?
+#ifdef __x86_64__
+#define pv_32bit_guest(_v) 0 // not yet supported
+#else
+#define pv_32bit_guest(_v) !hvm_guest(_v)
+#endif
+
+/* The shadow lock.
+ *
+ * This lock is per-domain. It is intended to allow us to make atomic
+ * updates to the software TLB that the shadow tables provide.
+ *
+ * Specifically, it protects:
+ * - all changes to shadow page table pages
+ * - the shadow hash table
+ * - the shadow page allocator
+ * - all changes to guest page table pages; if/when the notion of
+ * out-of-sync pages is added to this code, then the shadow lock is
+ * protecting all guest page table pages which are not currently
+ * listed as both guest-writable and out-of-sync...
+ * XXX -- need to think about this relative to writable page tables.
+ * - all changes to the page_info->tlbflush_timestamp
+ * - the page_info->count fields on shadow pages
+ * - the shadow dirty bit array and count
+ * - XXX
+ */
+#ifndef CONFIG_SMP
+#error shadow.h currently requires CONFIG_SMP
+#endif
+
+#define shadow_lock_init(_d) \
+ do { \
+ spin_lock_init(&(_d)->arch.shadow.lock); \
+ (_d)->arch.shadow.locker = -1; \
+ (_d)->arch.shadow.locker_function = "nobody"; \
+ } while (0)
+
+#define shadow_lock_is_acquired(_d) \
+ (current->processor == (_d)->arch.shadow.locker)
+
+#define shadow_lock(_d) \
+ do { \
+ if ( unlikely((_d)->arch.shadow.locker == current->processor) ) \
+ { \
+ printk("Error: shadow lock held by %s\n", \
+ (_d)->arch.shadow.locker_function); \
+ BUG(); \
+ } \
+ spin_lock(&(_d)->arch.shadow.lock); \
+ ASSERT((_d)->arch.shadow.locker == -1); \
+ (_d)->arch.shadow.locker = current->processor; \
+ (_d)->arch.shadow.locker_function = __func__; \
+ } while (0)
+
+#define shadow_unlock(_d) \
+ do { \
+ ASSERT((_d)->arch.shadow.locker == current->processor); \
+ (_d)->arch.shadow.locker = -1; \
+ (_d)->arch.shadow.locker_function = "nobody"; \
+ spin_unlock(&(_d)->arch.shadow.lock); \
+ } while (0)
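+
+/* Typical usage (an illustrative sketch, not a declaration from this header;
+ * example_shadow_op() is a hypothetical helper):
+ *
+ *     static void example_shadow_op(struct domain *d)
+ *     {
+ *         shadow_lock(d);
+ *         ASSERT(shadow_lock_is_acquired(d));
+ *         ... update shadow tables, hash table or allocator state ...
+ *         shadow_unlock(d);
+ *     }
+ *
+ * mark_dirty() further down is a real caller that follows this pattern. */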
+
+/*
+ * Levels of self-test and paranoia
+ * XXX should go in config files somewhere?
+ */
+#define SHADOW_AUDIT_HASH 0x01 /* Check current hash bucket */
+#define SHADOW_AUDIT_HASH_FULL 0x02 /* Check every hash bucket */
+#define SHADOW_AUDIT_ENTRIES 0x04 /* Check this walk's shadows */
+#define SHADOW_AUDIT_ENTRIES_FULL 0x08 /* Check every shadow */
+#define SHADOW_AUDIT_ENTRIES_MFNS 0x10 /* Check gfn-mfn map in shadows */
+#define SHADOW_AUDIT_P2M 0x20 /* Check the p2m table */
+
+#ifdef NDEBUG
+#define SHADOW_AUDIT 0
+#define SHADOW_AUDIT_ENABLE 0
+#else
+#define SHADOW_AUDIT 0x15 /* Basic audit of all except p2m. */
+#define SHADOW_AUDIT_ENABLE shadow_audit_enable
+extern int shadow_audit_enable;
+#endif
+
+/*
+ * Levels of optimization
+ * XXX should go in config files somewhere?
+ */
+#define SHOPT_WRITABLE_HEURISTIC 0x01 /* Guess at RW PTEs via linear maps */
+#define SHOPT_EARLY_UNSHADOW 0x02 /* Unshadow l1s on fork or exit */
+
+#define SHADOW_OPTIMIZATIONS 0x03
+
+
+/* With shadow pagetables, the different kinds of address start
+ * to get confusing.
+ *
+ * Virtual addresses are what they usually are: the addresses that are used
+ * to access memory while the guest is running. The MMU translates from
+ * virtual addresses to machine addresses.
+ *
+ * (Pseudo-)physical addresses are the abstraction of physical memory the
+ * guest uses for allocation and so forth. For the purposes of this code,
+ * we can largely ignore them.
+ *
+ * Guest frame numbers (gfns) are the entries that the guest puts in its
+ * pagetables. For normal paravirtual guests, they are actual frame numbers,
+ * with the translation done by the guest.
+ *
+ * Machine frame numbers (mfns) are the entries that the hypervisor puts
+ * in the shadow page tables.
+ *
+ * Elsewhere in the xen code base, the name "gmfn" is generally used to refer
+ * to a "machine frame number, from the guest's perspective", or in other
+ * words, pseudo-physical frame numbers. However, in the shadow code, the
+ * term "gmfn" means "the mfn of a guest page"; this combines naturally with
+ * other terms such as "smfn" (the mfn of a shadow page), gl2mfn (the mfn of a
+ * guest L2 page), etc...
+ */
+
+/* With this defined, we do some ugly things to force the compiler to
+ * give us type safety between mfns and gfns and other integers.
+ * TYPE_SAFE(int, foo) defines a foo_t, and _foo() and foo_x() functions
+ * that translate between int and foo_t.
+ *
+ * It does have some performance cost because the types now have
+ * a different storage attribute, so we may not want it on all the time. */
+#ifndef NDEBUG
+#define TYPE_SAFETY 1
+#endif
+
+#ifdef TYPE_SAFETY
+#define TYPE_SAFE(_type,_name) \
+typedef struct { _type _name; } _name##_t; \
+static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \
+static inline _type _name##_x(_name##_t n) { return n._name; }
+#else
+#define TYPE_SAFE(_type,_name) \
+typedef _type _name##_t; \
+static inline _name##_t _##_name(_type n) { return n; } \
+static inline _type _name##_x(_name##_t n) { return n; }
+#endif
+
+TYPE_SAFE(unsigned long,mfn)
+#define SH_PRI_mfn "05lx"
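+
+/* For illustration, with TYPE_SAFETY defined the TYPE_SAFE(unsigned long,mfn)
+ * invocation above expands to (roughly):
+ *
+ *     typedef struct { unsigned long mfn; } mfn_t;
+ *     static inline mfn_t _mfn(unsigned long n) { return (mfn_t) { n }; }
+ *     static inline unsigned long mfn_x(mfn_t n) { return n.mfn; }
+ *
+ * so raw frame numbers are wrapped with _mfn() and unwrapped with mfn_x(),
+ * and mixing mfns with plain integers becomes a compile-time error. */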
+
+static inline int
+valid_mfn(mfn_t m)
+{
+ return VALID_MFN(mfn_x(m));
+}
+
+static inline mfn_t
+pagetable_get_mfn(pagetable_t pt)
+{
+ return _mfn(pagetable_get_pfn(pt));
+}
+
+static inline pagetable_t
+pagetable_from_mfn(mfn_t mfn)
+{
+ return pagetable_from_pfn(mfn_x(mfn));
+}
+
+static inline int
+shadow_vcpu_mode_translate(struct vcpu *v)
+{
+ // Returns true if this VCPU needs to be using the P2M table to translate
+ // between GFNs and MFNs.
+ //
+ // This is true of translated HVM domains on a vcpu which has paging
+ // enabled. (HVM vcpus with paging disabled are using the p2m table as
+ // their paging table, so no translation occurs in this case.)
+ //
+ return v->arch.shadow.hvm_paging_enabled;
+}
+
+
+/**************************************************************************/
+/* Mode-specific entry points into the shadow code */
+
+struct x86_emulate_ctxt;
+struct shadow_paging_mode {
+ int (*page_fault )(struct vcpu *v, unsigned long va,
+ struct cpu_user_regs *regs);
+ int (*invlpg )(struct vcpu *v, unsigned long va);
+ unsigned long (*gva_to_gpa )(struct vcpu *v, unsigned long va);
+ unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va);
+ void (*update_cr3 )(struct vcpu *v);
+ int (*map_and_validate_gl1e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl2e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl2he)(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl3e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ int (*map_and_validate_gl4e )(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry, u32 size);
+ void (*detach_old_tables )(struct vcpu *v);
+ int (*x86_emulate_write )(struct vcpu *v, unsigned long va,
+ void *src, u32 bytes,
+ struct x86_emulate_ctxt *ctxt);
+ int (*x86_emulate_cmpxchg )(struct vcpu *v, unsigned long va,
+ unsigned long old,
+ unsigned long new,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt);
+ int (*x86_emulate_cmpxchg8b )(struct vcpu *v, unsigned long va,
+ unsigned long old_lo,
+ unsigned long old_hi,
+ unsigned long new_lo,
+ unsigned long new_hi,
+ struct x86_emulate_ctxt *ctxt);
+ mfn_t (*make_monitor_table )(struct vcpu *v);
+ void (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn);
+#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+ int (*guess_wrmap )(struct vcpu *v,
+ unsigned long vaddr, mfn_t gmfn);
+#endif
+ /* For outsiders to tell what mode we're in */
+ unsigned int shadow_levels;
+ unsigned int guest_levels;
+};
+
+static inline int shadow_guest_paging_levels(struct vcpu *v)
+{
+ ASSERT(v->arch.shadow.mode != NULL);
+ return v->arch.shadow.mode->guest_levels;
+}
+
+/**************************************************************************/
+/* Entry points into the shadow code */
+
+/* Turning on shadow test mode */
+int shadow_test_enable(struct domain *d);
+
+/* Handler for shadow control ops: enabling and disabling shadow modes,
+ * and log-dirty bitmap ops all happen through here. */
+int shadow_domctl(struct domain *d,
+ xen_domctl_shadow_op_t *sc,
+ XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
+
+/* Call when destroying a domain */
+void shadow_teardown(struct domain *d);
+
+/* Call once all of the references to the domain have gone away */
+void shadow_final_teardown(struct domain *d);
+
+
+/* Mark a page as dirty in the bitmap */
+void sh_do_mark_dirty(struct domain *d, mfn_t gmfn);
+static inline void mark_dirty(struct domain *d, unsigned long gmfn)
+{
+ if ( shadow_mode_log_dirty(d) )
+ {
+ shadow_lock(d);
+ sh_do_mark_dirty(d, _mfn(gmfn));
+ shadow_unlock(d);
+ }
+}
+
+/* Internal version, for when the shadow lock is already held */
+static inline void sh_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+ ASSERT(shadow_lock_is_acquired(d));
+ if ( shadow_mode_log_dirty(d) )
+ sh_do_mark_dirty(d, gmfn);
+}
+
+static inline int
+shadow_fault(unsigned long va, struct cpu_user_regs *regs)
+/* Called from pagefault handler in Xen, and from the HVM trap handlers
+ * for pagefaults. Returns 1 if this fault was an artefact of the
+ * shadow code (and the guest should retry) or 0 if it is not (and the
+ * fault should be handled elsewhere or passed to the guest). */
+{
+ struct vcpu *v = current;
+ perfc_incrc(shadow_fault);
+ return v->arch.shadow.mode->page_fault(v, va, regs);
+}
+
+static inline int
+shadow_invlpg(struct vcpu *v, unsigned long va)
+/* Called when the guest requests an invlpg. Returns 1 if the invlpg
+ * instruction should be issued on the hardware, or 0 if it's safe not
+ * to do so. */
+{
+ return v->arch.shadow.mode->invlpg(v, va);
+}
+
+static inline unsigned long
+shadow_gva_to_gpa(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to the guest physical
+ * address that the *guest* pagetables would map it to. */
+{
+ return v->arch.shadow.mode->gva_to_gpa(v, va);
+}
+
+static inline unsigned long
+shadow_gva_to_gfn(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to the guest frame number
+ * that the *guest* pagetables would map it to. */
+{
+ return v->arch.shadow.mode->gva_to_gfn(v, va);
+}
+
+static inline void
+shadow_update_cr3(struct vcpu *v)
+/* Updates all the things that are derived from the guest's CR3.
+ * Called when the guest changes CR3. */
+{
+ shadow_lock(v->domain);
+ v->arch.shadow.mode->update_cr3(v);
+ shadow_unlock(v->domain);
+}
+
+
+/* Should be called after CR3 is updated.
+ * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3.
+ *
+ * Also updates other state derived from CR3 (vcpu->arch.guest_vtable,
+ * shadow_vtable, etc).
+ *
+ * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
+ * for HVM guests, arch.monitor_table and hvm's guest CR3.
+ *
+ * Update ref counts to shadow tables appropriately.
+ * For PAE, relocate L3 entries, if necessary, into low memory.
+ */
+static inline void update_cr3(struct vcpu *v)
+{
+ unsigned long cr3_mfn=0;
+
+ if ( shadow_mode_enabled(v->domain) )
+ {
+ shadow_update_cr3(v);
+ return;
+ }
+
+#if CONFIG_PAGING_LEVELS == 4
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
+ else
+#endif
+ cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
+
+ make_cr3(v, cr3_mfn);
+}
+
+extern void sh_update_paging_modes(struct vcpu *v);
+
+/* Should be called to initialise paging structures if the paging mode
+ * has changed, and when bringing up a VCPU for the first time. */
+static inline void shadow_update_paging_modes(struct vcpu *v)
+{
+ ASSERT(shadow_mode_enabled(v->domain));
+ shadow_lock(v->domain);
+ sh_update_paging_modes(v);
+ shadow_unlock(v->domain);
+}
+
+static inline void
+shadow_detach_old_tables(struct vcpu *v)
+{
+ if ( v->arch.shadow.mode )
+ v->arch.shadow.mode->detach_old_tables(v);
+}
+
+static inline mfn_t
+shadow_make_monitor_table(struct vcpu *v)
+{
+ return v->arch.shadow.mode->make_monitor_table(v);
+}
+
+static inline void
+shadow_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
+{
+ v->arch.shadow.mode->destroy_monitor_table(v, mmfn);
+}
+
+/* Validate a pagetable change from the guest and update the shadows. */
+extern int shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
+ void *new_guest_entry);
+
+/* Update the shadows in response to a pagetable write from a HVM guest */
+extern void shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size);
+
+/* Remove all writeable mappings of a guest frame from the shadows.
+ * Returns non-zero if we need to flush TLBs.
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access. */
+extern int shadow_remove_write_access(struct vcpu *v, mfn_t readonly_mfn,
+ unsigned int level,
+ unsigned long fault_addr);
+
+/* Remove all mappings of the guest mfn from the shadows.
+ * Returns non-zero if we need to flush TLBs. */
+extern int shadow_remove_all_mappings(struct vcpu *v, mfn_t target_mfn);
+
+void
+shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn);
+/* This is an HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+
+/* Remove all shadows of the guest mfn. */
+extern void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all);
+static inline void shadow_remove_all_shadows(struct vcpu *v, mfn_t gmfn)
+{
+ sh_remove_shadows(v, gmfn, 1);
+}
+
+/* Add a page to a domain */
+void
+shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn);
+
+/* Remove a page from a domain */
+void
+shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn);
+
+/*
+ * Definitions for the shadow_flags field in page_info.
+ * These flags are stored on *guest* pages...
+ * Bits 1-13 are encodings for the shadow types.
+ */
+#define PGC_SH_type_to_index(_type) ((_type) >> PGC_SH_type_shift)
+#define SHF_page_type_mask \
+ (((1u << (PGC_SH_type_to_index(PGC_SH_max_shadow) + 1u)) - 1u) - \
+ ((1u << PGC_SH_type_to_index(PGC_SH_min_shadow)) - 1u))
+
+#define SHF_L1_32 (1u << PGC_SH_type_to_index(PGC_SH_l1_32_shadow))
+#define SHF_FL1_32 (1u << PGC_SH_type_to_index(PGC_SH_fl1_32_shadow))
+#define SHF_L2_32 (1u << PGC_SH_type_to_index(PGC_SH_l2_32_shadow))
+#define SHF_L1_PAE (1u << PGC_SH_type_to_index(PGC_SH_l1_pae_shadow))
+#define SHF_FL1_PAE (1u << PGC_SH_type_to_index(PGC_SH_fl1_pae_shadow))
+#define SHF_L2_PAE (1u << PGC_SH_type_to_index(PGC_SH_l2_pae_shadow))
+#define SHF_L2H_PAE (1u << PGC_SH_type_to_index(PGC_SH_l2h_pae_shadow))
+#define SHF_L3_PAE (1u << PGC_SH_type_to_index(PGC_SH_l3_pae_shadow))
+#define SHF_L1_64 (1u << PGC_SH_type_to_index(PGC_SH_l1_64_shadow))
+#define SHF_FL1_64 (1u << PGC_SH_type_to_index(PGC_SH_fl1_64_shadow))
+#define SHF_L2_64 (1u << PGC_SH_type_to_index(PGC_SH_l2_64_shadow))
+#define SHF_L3_64 (1u << PGC_SH_type_to_index(PGC_SH_l3_64_shadow))
+#define SHF_L4_64 (1u << PGC_SH_type_to_index(PGC_SH_l4_64_shadow))
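+
+/* Worked example of the encoding (illustrative only): PGC_SH_type_shift is
+ * 28, so PGC_SH_l1_32_shadow (1U<<28) indexes bit 1 and PGC_SH_l4_64_shadow
+ * (13U<<28) indexes bit 13. SHF_page_type_mask is then
+ * ((1u<<14)-1) - ((1u<<1)-1) = 0x3ffe, i.e. bits 1-13, with SHF_L1_32 = 0x2
+ * and SHF_L4_64 = 0x2000. */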
+
+/* Used for hysteresis when automatically unhooking mappings on fork/exit */
+#define SHF_unhooked_mappings (1u<<31)
+
+/*
+ * Allocation of shadow pages
+ */
+
+/* Return the minimum acceptable number of shadow pages a domain needs */
+unsigned int shadow_min_acceptable_pages(struct domain *d);
+
+/* Set the pool of shadow pages to the required number of MB.
+ * Input will be rounded up to at least shadow_min_acceptable_pages().
+ * Returns 0 for success, 1 for failure. */
+unsigned int shadow_set_allocation(struct domain *d,
+ unsigned int megabytes,
+ int *preempted);
+
+/* Return the size of the shadow pool, rounded up to the nearest MB */
+static inline unsigned int shadow_get_allocation(struct domain *d)
+{
+ unsigned int pg = d->arch.shadow.total_pages;
+ return ((pg >> (20 - PAGE_SHIFT))
+ + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
+}
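+
+/* Illustrative arithmetic only: with 4kB pages PAGE_SHIFT is 12, so the
+ * shift above is 20 - 12 = 8 and one MB is 256 pages. A pool of 300 pages
+ * gives (300 >> 8) = 1 plus 1 for the non-zero remainder, so
+ * shadow_get_allocation() reports 2MB. */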
+
+/*
+ * Linked list for chaining entries in the shadow hash table.
+ */
+struct shadow_hash_entry {
+ struct shadow_hash_entry *next;
+ mfn_t smfn; /* MFN of the shadow */
+#ifdef __x86_64__ /* Shorten 'n' so we don't waste a whole word on storing 't' */
+ unsigned long n:56; /* MFN of guest PT or GFN of guest superpage */
+#else
+ unsigned long n; /* MFN of guest PT or GFN of guest superpage */
+#endif
+ unsigned char t; /* shadow type bits, or 0 for empty */
+};
+
+#define SHADOW_HASH_BUCKETS 251
+/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
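+
+/* Minimal sketch of how such a chained table is searched (the real lookup
+ * lives in shadow/common.c; 'bucket' stands for whichever bucket head the
+ * hash of (n, t) selects -- names here are illustrative only):
+ *
+ *     struct shadow_hash_entry *p;
+ *     for ( p = bucket; p != NULL; p = p->next )
+ *         if ( p->t == t && p->n == n )
+ *             return p->smfn;
+ *     return _mfn(INVALID_MFN);
+ */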
+
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_CACHE_WALKS
+/* Optimization: cache the results of guest walks. This helps with MMIO
+ * and emulated writes, which tend to issue very similar walk requests
+ * repeatedly. We keep the results of the last few walks, and blow
+ * away the cache on guest cr3 write, mode change, or page fault. */
+
+#define SH_WALK_CACHE_ENTRIES 4
+
+/* Rather than cache a guest walk, which would include mapped pointers
+ * to pages, we cache what a TLB would remember about the walk: the
+ * permissions and the l1 gfn */
+struct shadow_walk_cache {
+ unsigned long va; /* The virtual address (or 0 == unused) */
+ unsigned long gfn; /* The gfn from the effective l1e */
+ u32 permissions; /* The aggregated permission bits */
+};
+#endif
+
+
+/**************************************************************************/
+/* Guest physmap (p2m) support */
+
+/* Walk another domain's P2M table, mapping pages as we go */
+extern mfn_t
+sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn);
+
+
+/* General conversion function from gfn to mfn */
+static inline mfn_t
+sh_gfn_to_mfn(struct domain *d, unsigned long gfn)
+{
+ if ( !shadow_mode_translate(d) )
+ return _mfn(gfn);
+ else if ( likely(current->domain == d) )
+ return _mfn(get_mfn_from_gpfn(gfn));
+ else
+ return sh_gfn_to_mfn_foreign(d, gfn);
+}
+
+// vcpu-specific version of gfn_to_mfn(). This is where we hide the dirty
+// little secret that, for hvm guests with paging disabled, nearly all of the
+// shadow code actually thinks that the guest is running on *untranslated* page
+// tables (which is actually domain->phys_table).
+//
+static inline mfn_t
+sh_vcpu_gfn_to_mfn(struct vcpu *v, unsigned long gfn)
+{
+ if ( !shadow_vcpu_mode_translate(v) )
+ return _mfn(gfn);
+ if ( likely(current->domain == v->domain) )
+ return _mfn(get_mfn_from_gpfn(gfn));
+ return sh_gfn_to_mfn_foreign(v->domain, gfn);
+}
+
+static inline unsigned long
+sh_mfn_to_gfn(struct domain *d, mfn_t mfn)
+{
+ if ( shadow_mode_translate(d) )
+ return get_gpfn_from_mfn(mfn_x(mfn));
+ else
+ return mfn_x(mfn);
+}
+
+
#endif /* _XEN_SHADOW_H */
@@ -49,7 +633,7 @@
* mode: C
* c-set-style: "BSD"
* c-basic-offset: 4
- * tab-width: 4
* indent-tabs-mode: nil
* End:
*/
+
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/mm/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/Makefile Mon Aug 28 16:26:37 2006 -0600
@@ -0,0 +1,1 @@
+subdir-y += shadow
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/mm/shadow/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/shadow/Makefile Mon Aug 28 16:26:37 2006 -0600
@@ -0,0 +1,15 @@
+ifneq ($(pae),n)
+obj-$(x86_32) += common.o g2_on_s3.o g3_on_s3.o
+else
+obj-$(x86_32) += common.o g2_on_s2.o
+endif
+
+obj-$(x86_64) += common.o g4_on_s4.o g3_on_s3.o g2_on_s3.o
+
+guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(1)))))
+shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(1)))))
+shadow_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
+ -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
+
+g%.o: multi.c $(HDRS) Makefile
+ $(CC) $(CFLAGS) $(call shadow_defns,$(@F)) -c $< -o $@
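+
+# For illustration (not one of the build rules above): when g2_on_s3.o is
+# built, guest_levels/shadow_levels pick the "2" and "3" out of the target
+# name, so multi.c is compiled with -DGUEST_PAGING_LEVELS=2
+# -DSHADOW_PAGING_LEVELS=3; g4_on_s4.o likewise gets
+# -DGUEST_PAGING_LEVELS=4 -DSHADOW_PAGING_LEVELS=4.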
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/mm/shadow/common.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/shadow/common.c Mon Aug 28 16:26:37 2006 -0600
@@ -0,0 +1,3407 @@
+/******************************************************************************
+ * arch/x86/mm/shadow/common.c
+ *
+ * Shadow code that does not need to be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define SHADOW 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/irq.h>
+#include <xen/domain_page.h>
+#include <xen/guest_access.h>
+#include <xen/keyhandler.h>
+#include <asm/event.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/flushtlb.h>
+#include <asm/shadow.h>
+#include "private.h"
+
+#if SHADOW_AUDIT
+int shadow_audit_enable = 0;
+
+static void shadow_audit_key(unsigned char key)
+{
+ shadow_audit_enable = !shadow_audit_enable;
+ printk("%s shadow_audit_enable=%d\n",
+ __func__, shadow_audit_enable);
+}
+
+static int __init shadow_audit_key_init(void)
+{
+ register_keyhandler(
+ 'O', shadow_audit_key, "toggle shadow audits");
+ return 0;
+}
+__initcall(shadow_audit_key_init);
+#endif /* SHADOW_AUDIT */
+
+static void sh_free_log_dirty_bitmap(struct domain *d);
+
+int _shadow_mode_refcounts(struct domain *d)
+{
+ return shadow_mode_refcounts(d);
+}
+
+
+/**************************************************************************/
+/* x86 emulator support for the shadow code
+ */
+
+static int
+sh_x86_emulate_read_std(unsigned long addr,
+ unsigned long *val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+ if ( hvm_guest(v) )
+ {
+ *val = 0;
+ // XXX -- this is WRONG.
+ // It entirely ignores the permissions in the page tables.
+ // In this case, that is only a user vs supervisor access check.
+ //
+ if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
+ {
+#if 0
+ SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id,
+ addr, *val, bytes);
+#endif
+ return X86EMUL_CONTINUE;
+ }
+
+ /* If we got here, there was nothing mapped here, or a bad GFN
+ * was mapped here. This should never happen: we're here because
+ * of a write fault at the end of the instruction we're emulating. */
+ SHADOW_PRINTK("read failed to va %#lx\n", addr);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ else
+ {
+ SHADOW_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh_x86_emulate_write_std(unsigned long addr,
+ unsigned long val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ // XXX -- this is WRONG.
+ // It entirely ignores the permissions in the page tables.
+ // In this case, that includes user vs supervisor, and
+ // write access.
+ //
+ if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
+ return X86EMUL_CONTINUE;
+
+ /* If we got here, there was nothing mapped here, or a bad GFN
+ * was mapped here. This should never happen: we're here because
+ * of a write fault at the end of the instruction we're emulating,
+ * which should be handled by sh_x86_emulate_write_emulated. */
+ SHADOW_PRINTK("write failed to va %#lx\n", addr);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ else
+ {
+ SHADOW_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh_x86_emulate_write_emulated(unsigned long addr,
+ unsigned long val,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
+ }
+ else
+ {
+ SHADOW_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh_x86_emulate_cmpxchg_emulated(unsigned long addr,
+ unsigned long old,
+ unsigned long new,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
+ v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new,
+ bytes, ctxt);
+ }
+ else
+ {
+ SHADOW_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int
+sh_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
+ unsigned long old_lo,
+ unsigned long old_hi,
+ unsigned long new_lo,
+ unsigned long new_hi,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct vcpu *v = current;
+#if 0
+ SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
+ v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
+ new_hi, new_lo, ctxt);
+#endif
+ if ( hvm_guest(v) )
+ {
+ return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
+ new_lo, new_hi, ctxt);
+ }
+ else
+ {
+ SHADOW_PRINTK("this operation is not emulated yet\n");
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+
+struct x86_emulate_ops shadow_emulator_ops = {
+ .read_std = sh_x86_emulate_read_std,
+ .write_std = sh_x86_emulate_write_std,
+ .read_emulated = sh_x86_emulate_read_std,
+ .write_emulated = sh_x86_emulate_write_emulated,
+ .cmpxchg_emulated = sh_x86_emulate_cmpxchg_emulated,
+ .cmpxchg8b_emulated = sh_x86_emulate_cmpxchg8b_emulated,
+};
+
+
+/**************************************************************************/
+/* Code for "promoting" a guest page to the point where the shadow code is
+ * willing to let it be treated as a guest page table. This generally
+ * involves making sure there are no writable mappings available to the guest
+ * for this page.
+ */
+void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+ unsigned long type_info;
+
+ ASSERT(valid_mfn(gmfn));
+
+ /* We should never try to promote a gmfn that has writeable mappings */
+ ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0);
+
+ // Is the page already shadowed?
+ if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
+ {
+ // No prior shadow exists...
+
+ // Grab a type-ref. We don't really care if we are racing with another
+ // vcpu or not, or even what kind of type we get; we just want the type
+ // count to be > 0.
+ //
+ do {
+ type_info =
+ page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
+ } while ( !get_page_type(page, type_info) );
+
+ // Now that the type ref is non-zero, we can safely use the
+ // shadow_flags.
+ //
+ page->shadow_flags = 0;
+ }
+
+ ASSERT(!test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
+ set_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
+}
+
+void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+
+ ASSERT(test_bit(_PGC_page_table, &page->count_info));
+ ASSERT(test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
+
+ clear_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
+
+ if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
+ {
+ // release the extra type ref
+ put_page_type(page);
+
+ // clear the is-a-page-table bit.
+ clear_bit(_PGC_page_table, &page->count_info);
+ }
+}
+
+/**************************************************************************/
+/* Validate a pagetable change from the guest and update the shadows.
+ * Returns a bitmask of SHADOW_SET_* flags. */
+
+static int
+__shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size)
+{
+ int result = 0;
+ struct page_info *page = mfn_to_page(gmfn);
+
+ sh_mark_dirty(v->domain, gmfn);
+
+ // Determine which types of shadows are affected, and update each.
+ //
+ // Always validate L1s before L2s to prevent another cpu with a linear
+ // mapping of this gmfn from seeing a walk that results from
+ // using the new L2 value and the old L1 value. (It is OK for such a
+ // guest to see a walk that uses the old L2 value with the new L1 value,
+ // as hardware could behave this way if one level of the pagewalk occurs
+ // before the store, and the next level of the pagewalk occurs after the
+ // store.)
+ //
+ // Ditto for L2s before L3s, etc.
+ //
+
+ if ( !(page->count_info & PGC_page_table) )
+ return 0; /* Not shadowed at all */
+
+#if CONFIG_PAGING_LEVELS == 2
+ if ( page->shadow_flags & SHF_L1_32 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
+ (v, gmfn, entry, size);
+#else
+ if ( page->shadow_flags & SHF_L1_32 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
+ (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS == 2
+ if ( page->shadow_flags & SHF_L2_32 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
+ (v, gmfn, entry, size);
+#else
+ if ( page->shadow_flags & SHF_L2_32 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
+ (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( page->shadow_flags & SHF_L1_PAE )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L2_PAE )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L2H_PAE )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L3_PAE )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 3, 3)
+ (v, gmfn, entry, size);
+#else /* 32-bit non-PAE hypervisor does not support PAE guests */
+ ASSERT((page->shadow_flags & (SHF_L3_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( page->shadow_flags & SHF_L1_64 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L2_64 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L3_64 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
+ (v, gmfn, entry, size);
+ if ( page->shadow_flags & SHF_L4_64 )
+ result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
+ (v, gmfn, entry, size);
+#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
+ ASSERT((page->shadow_flags
+ & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
+#endif
+
+ return result;
+}
+
+
+int
+shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
+/* This is the entry point from hypercalls. It returns a bitmask of all the
+ * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
+{
+ int rc;
+
+ ASSERT(shadow_lock_is_acquired(v->domain));
+ rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
+ shadow_audit_tables(v);
+ return rc;
+}
+
+void
+shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+ void *entry, u32 size)
+/* This is the entry point for emulated writes to pagetables in HVM guests */
+{
+ struct domain *d = v->domain;
+ int rc;
+
+ ASSERT(shadow_lock_is_acquired(v->domain));
+ rc = __shadow_validate_guest_entry(v, gmfn, entry, size);
+ if ( rc & SHADOW_SET_FLUSH )
+ {
+ // Flush everyone except the local processor, which will flush when it
+ // re-enters the HVM guest.
+ //
+ cpumask_t mask = d->domain_dirty_cpumask;
+ cpu_clear(v->processor, mask);
+ flush_tlb_mask(mask);
+ }
+ if ( rc & SHADOW_SET_ERROR )
+ {
+ /* This page is probably not a pagetable any more: tear it out of the
+ * shadows, along with any tables that reference it */
+ shadow_remove_all_shadows_and_parents(v, gmfn);
+ }
+ /* We ignore the other bits: since we are about to change CR3 on
+ * VMENTER we don't need to do any extra TLB flushes. */
+}
+
+
+/**************************************************************************/
+/* Memory management for shadow pages. */
+
+/* Meaning of the count_info field in shadow pages
+ * ----------------------------------------------
+ *
+ * A count of all references to this page from other shadow pages and
+ * guest CR3s (a.k.a. v->arch.shadow.table).
+ *
+ * The top bits hold the shadow type and the pinned bit. Top-level
+ * shadows are pinned so that they don't disappear when not in a CR3
+ * somewhere.
+ *
+ * We don't need to use get|put_page for this as the updates are all
+ * protected by the shadow lock. We can't use get|put_page for this
+ * as the size of the count on shadow pages is different from that on
+ * normal guest pages.
+ */
+
+/* Meaning of the type_info field in shadow pages
+ * ----------------------------------------------
+ *
+ * type_info use depends on the shadow type (from count_info)
+ *
+ * PGC_SH_none : This page is in the shadow free pool. type_info holds
+ * the chunk order for our freelist allocator.
+ *
+ * PGC_SH_l*_shadow : This page is in use as a shadow. type_info
+ * holds the mfn of the guest page being shadowed,
+ *
+ * PGC_SH_fl1_*_shadow : This page is being used to shatter a superpage.
+ * type_info holds the gfn being shattered.
+ *
+ * PGC_SH_monitor_table : This page is part of a monitor table.
+ * type_info is not used.
+ */
+
+/* Meaning of the _domain field in shadow pages
+ * --------------------------------------------
+ *
+ * In shadow pages, this field will always have its least significant bit
+ * set. This ensures that all attempts to get_page() will fail (as all
+ * valid pickled domain pointers have a zero for their least significant bit).
+ * Instead, the remaining upper bits are used to record the shadow generation
+ * counter when the shadow was created.
+ */
+
+/* Meaning of the shadow_flags field
+ * ----------------------------------
+ *
+ * In guest pages that are shadowed, one bit for each kind of shadow they have.
+ *
+ * In shadow pages, will be used for holding a representation of the populated
+ * entries in this shadow (either a min/max, or a bitmap, or ...)
+ *
+ * In monitor-table pages, holds the level of the particular page (to save
+ * spilling the shadow types into an extra bit by having three types of monitor
+ * page).
+ */
+
+/* Meaning of the list_head struct in shadow pages
+ * -----------------------------------------------
+ *
+ * In free shadow pages, this is used to hold the free-lists of chunks.
+ *
+ * In top-level shadow tables, this holds a linked-list of all top-level
+ * shadows (used for recovering memory and destroying shadows).
+ *
+ * In lower-level shadows, this holds the physical address of a higher-level
+ * shadow entry that holds a reference to this shadow (or zero).
+ */
+
+/* Allocating shadow pages
+ * -----------------------
+ *
+ * Most shadow pages are allocated singly, but there are two cases where we
+ * need to allocate multiple pages together.
+ *
+ * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
+ * A 32-bit guest l1 table covers 4MB of virtual address space,
+ * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
+ * of virtual address space each). Similarly, a 32-bit guest l2 table
+ * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va
+ * each). These multi-page shadows are contiguous and aligned;
+ * functions for handling offsets into them are defined in shadow.c
+ * (shadow_l1_index() etc.)
+ *
+ * 2: Shadowing PAE top-level pages. Each guest page that contains
+ * any PAE top-level pages requires two shadow pages to shadow it.
+ * They contain alternating l3 tables and pae_l3_bookkeeping structs.
+ *
+ * This table shows the allocation behaviour of the different modes:
+ *
+ * Xen paging 32b pae pae 64b 64b 64b
+ * Guest paging 32b 32b pae 32b pae 64b
+ * PV or HVM * HVM * HVM HVM *
+ * Shadow paging 32b pae pae pae pae 64b
+ *
+ * sl1 size 4k 8k 4k 8k 4k 4k
+ * sl2 size 4k 16k 4k 16k 4k 4k
+ * sl3 size - - 8k - 8k 4k
+ * sl4 size - - - - - 4k
+ *
+ * We allocate memory from xen in four-page units and break them down
+ * with a simple buddy allocator. Can't use the xen allocator to handle
+ * this as it only works for contiguous zones, and a domain's shadow
+ * pool is made of fragments.
+ *
+ * In HVM guests, the p2m table is built out of shadow pages, and we provide
+ * a function for the p2m management to steal pages, in max-order chunks, from
+ * the free pool. We don't provide for giving them back, yet.
+ */
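+
+/* Worked example of case 1 above (illustrative arithmetic only): a 32-bit
+ * guest l1 has 1024 4-byte entries and so maps 1024 * 4kB = 4MB, while a
+ * PAE/64-bit l1 has 512 8-byte entries mapping 2MB, so shadowing the guest
+ * l1 takes two shadow pages (order 1); a 32-bit guest l2 (4GB of va) needs
+ * four PAE/64-bit l2 pages (order 2), matching the 8k/16k rows above. */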
+
+/* Figure out the least acceptable quantity of shadow memory.
+ * The minimum memory requirement for always being able to free up a
+ * chunk of memory is very small -- only three max-order chunks per
+ * vcpu to hold the top level shadows and pages with Xen mappings in them.
+ *
+ * But for a guest to be guaranteed to successfully execute a single
+ * instruction, we must be able to map a large number (about thirty) of VAs
+ * at the same time, which means that to guarantee progress, we must
+ * allow for more than ninety allocated pages per vcpu. We round that
+ * up to 128 pages, or half a megabyte per vcpu. */
+unsigned int shadow_min_acceptable_pages(struct domain *d)
+{
+ u32 vcpu_count = 0;
+ struct vcpu *v;
+
+ for_each_vcpu(d, v)
+ vcpu_count++;
+
+ return (vcpu_count * 128);
+}
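+
+/* Illustrative check of the numbers above: 128 pages per vcpu at 4kB each
+ * is 512kB -- the "half a megabyte per vcpu" mentioned in the comment --
+ * so e.g. a 4-vcpu domain needs at least 512 shadow pages (2MB). */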
+
+/* Using the type_info field to store freelist order */
+#define SH_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
+#define SH_SET_PFN_ORDER(_p, _o) \
+ do { (_p)->u.inuse.type_info = (_o); } while (0)
+
+
+/* Figure out the order of allocation needed for a given shadow type */
+static inline u32
+shadow_order(u32 shadow_type)
+{
+#if CONFIG_PAGING_LEVELS > 2
+ static const u32 type_to_order[16] = {
+ 0, /* PGC_SH_none */
+ 1, /* PGC_SH_l1_32_shadow */
+ 1, /* PGC_SH_fl1_32_shadow */
+ 2, /* PGC_SH_l2_32_shadow */
+ 0, /* PGC_SH_l1_pae_shadow */
+ 0, /* PGC_SH_fl1_pae_shadow */
+ 0, /* PGC_SH_l2_pae_shadow */
+ 0, /* PGC_SH_l2h_pae_shadow */
+ 1, /* PGC_SH_l3_pae_shadow */
+ 0, /* PGC_SH_l1_64_shadow */
+ 0, /* PGC_SH_fl1_64_shadow */
+ 0, /* PGC_SH_l2_64_shadow */
+ 0, /* PGC_SH_l3_64_shadow */
+ 0, /* PGC_SH_l4_64_shadow */
+ 2, /* PGC_SH_p2m_table */
+ 0 /* PGC_SH_monitor_table */
+ };
+ u32 type = (shadow_type & PGC_SH_type_mask) >> PGC_SH_type_shift;
+ return type_to_order[type];
+#else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
+ return 0;
+#endif
+}
+
+
+/* Do we have a free chunk of at least this order? */
+static inline int chunk_is_available(struct domain *d, int order)
+{
+ int i;
+
+ for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
+ if ( !list_empty(&d->arch.shadow.freelists[i]) )
+ return 1;
+ return 0;
+}
+
+/* Dispatcher function: call the per-mode function that will unhook the
+ * non-Xen mappings in this top-level shadow mfn */
+void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ switch ( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift )
+ {
+ case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
+#else
+ SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
+#endif
+ break;
+#if CONFIG_PAGING_LEVELS >= 3
+ case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
+ break;
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
+ case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
+ break;
+#endif
+ default:
+ SHADOW_PRINTK("top-level shadow has bad type %08lx\n",
+ (unsigned long)((pg->count_info & PGC_SH_type_mask)
+ >> PGC_SH_type_shift));
+ BUG();
+ }
+}
+
+
+/* Make sure there is at least one chunk of the required order available
+ * in the shadow page pool. This must be called before any calls to
+ * shadow_alloc(). Since this will free existing shadows to make room,
+ * it must be called early enough to avoid freeing shadows that the
+ * caller is currently working on. */
+void shadow_prealloc(struct domain *d, unsigned int order)
+{
+ /* Need a vcpu for calling unpins; for now, since we don't have
+ * per-vcpu shadows, any will do */
+ struct vcpu *v = d->vcpu[0];
+ struct list_head *l, *t;
+ struct page_info *pg;
+ mfn_t smfn;
+
+ if ( chunk_is_available(d, order) ) return;
+
+ /* Stage one: walk the list of top-level pages, unpinning them */
+ perfc_incrc(shadow_prealloc_1);
+ list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ smfn = page_to_mfn(pg);
+
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( (pg->count_info & PGC_SH_type_mask) == PGC_SH_l3_pae_shadow )
+ {
+ /* For PAE, we need to unpin each subshadow on this shadow */
+ SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn);
+ }
+ else
+#endif /* 32-bit code always takes this branch */
+ {
+ /* Unpin this top-level shadow */
+ sh_unpin(v, smfn);
+ }
+
+ /* See if that freed up a chunk of appropriate size */
+ if ( chunk_is_available(d, order) ) return;
+ }
+
+ /* Stage two: all shadow pages are in use in hierarchies that are
+ * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
+ * mappings. */
+ perfc_incrc(shadow_prealloc_2);
+ v = current;
+ if ( v->domain != d )
+ v = d->vcpu[0];
+ /* Walk the list from the tail: recently used toplevels have been pulled
+ * to the head */
+ list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ smfn = page_to_mfn(pg);
+ shadow_unhook_mappings(v, smfn);
+
+ /* Need to flush TLB if we've altered our own tables */
+ if ( !shadow_mode_external(d)
+ && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
+ local_flush_tlb();
+
+ /* See if that freed up a chunk of appropriate size */
+ if ( chunk_is_available(d, order) ) return;
+ }
+
+ /* Nothing more we can do: all remaining shadows are of pages that
+ * hold Xen mappings for some vcpu. This can never happen. */
+ SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
+ " shadow pages total = %u, free = %u, p2m=%u\n",
+ 1 << order,
+ d->arch.shadow.total_pages,
+ d->arch.shadow.free_pages,
+ d->arch.shadow.p2m_pages);
+ BUG();
+}
+
+
+/* Allocate another shadow's worth of (contiguous, aligned) pages,
+ * and fill in the type and backpointer fields of their page_infos.
+ * Never fails to allocate. */
+mfn_t shadow_alloc(struct domain *d,
+ u32 shadow_type,
+ unsigned long backpointer)
+{
+ struct page_info *pg = NULL;
+ unsigned int order = shadow_order(shadow_type);
+ cpumask_t mask;
+ void *p;
+ int i;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ ASSERT(order <= SHADOW_MAX_ORDER);
+ ASSERT(shadow_type != PGC_SH_none);
+ perfc_incrc(shadow_alloc);
+
+ /* Find smallest order which can satisfy the request. */
+ for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
+ if ( !list_empty(&d->arch.shadow.freelists[i]) )
+ {
+ pg = list_entry(d->arch.shadow.freelists[i].next,
+ struct page_info, list);
+ list_del(&pg->list);
+
+ /* We may have to halve the chunk a number of times. */
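+            /* e.g. splitting an order-2 chunk for an order-0 request
+             * puts an order-1 block and an order-0 block back on the
+             * free lists and hands the remaining order-0 page to the
+             * caller below. */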
+ while ( i != order )
+ {
+ i--;
+ SH_SET_PFN_ORDER(pg, i);
+ list_add_tail(&pg->list, &d->arch.shadow.freelists[i]);
+ pg += 1 << i;
+ }
+ d->arch.shadow.free_pages -= 1 << order;
+
+ /* Init page info fields and clear the pages */
+ for ( i = 0; i < 1<<order ; i++ )
+ {
+ pg[i].u.inuse.type_info = backpointer;
+ pg[i].count_info = shadow_type;
+ pg[i].shadow_flags = 0;
+ INIT_LIST_HEAD(&pg[i].list);
+ /* Before we overwrite the old contents of this page,
+ * we need to be sure that no TLB holds a pointer to it. */
+ mask = d->domain_dirty_cpumask;
+ tlbflush_filter(mask, pg[i].tlbflush_timestamp);
+ if ( unlikely(!cpus_empty(mask)) )
+ {
+ perfc_incrc(shadow_alloc_tlbflush);
+ flush_tlb_mask(mask);
+ }
+ /* Now safe to clear the page for reuse */
+ p = sh_map_domain_page(page_to_mfn(pg+i));
+ ASSERT(p != NULL);
+ clear_page(p);
+ sh_unmap_domain_page(p);
+ perfc_incr(shadow_alloc_count);
+ }
+ return page_to_mfn(pg);
+ }
+
+ /* If we get here, we failed to allocate. This should never happen.
+ * It means that we didn't call shadow_prealloc() correctly before
+ * we allocated. We can't recover by calling prealloc here, because
+ * we might free up higher-level pages that the caller is working on. */
+ SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
+ BUG();
+}
+
+
+/* Return some shadow pages to the pool. */
+void shadow_free(struct domain *d, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ u32 shadow_type;
+ unsigned long order;
+ unsigned long mask;
+ int i;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ perfc_incrc(shadow_free);
+
+ shadow_type = pg->count_info & PGC_SH_type_mask;
+ ASSERT(shadow_type != PGC_SH_none);
+ ASSERT(shadow_type != PGC_SH_p2m_table);
+ order = shadow_order(shadow_type);
+
+ d->arch.shadow.free_pages += 1 << order;
+
+ for ( i = 0; i < 1<<order; i++ )
+ {
+ /* Strip out the type: this is now a free shadow page */
+ pg[i].count_info = 0;
+ /* Remember the TLB timestamp so we will know whether to flush
+ * TLBs when we reuse the page. Because the destructors leave the
+ * contents of the pages in place, we can delay TLB flushes until
+ * just before the allocator hands the page out again. */
+ pg[i].tlbflush_timestamp = tlbflush_current_time();
+ perfc_decr(shadow_alloc_count);
+ }
+
+ /* Merge chunks as far as possible. */
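+    /* The buddy of this 2^order block is the adjacent block of the same
+     * size; bit 'order' of the mfn tells us whether that buddy lies just
+     * below (bit set) or just above (bit clear).  We stop merging as soon
+     * as the buddy is in use or has been split to a smaller order. */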
+ while ( order < SHADOW_MAX_ORDER )
+ {
+ mask = 1 << order;
+ if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
+ /* Merge with predecessor block? */
+ if ( (((pg-mask)->count_info & PGC_SH_type_mask) != PGT_none)
+ || (SH_PFN_ORDER(pg-mask) != order) )
+ break;
+ list_del(&(pg-mask)->list);
+ pg -= mask;
+ } else {
+ /* Merge with successor block? */
+ if ( (((pg+mask)->count_info & PGC_SH_type_mask) != PGT_none)
+ || (SH_PFN_ORDER(pg+mask) != order) )
+ break;
+ list_del(&(pg+mask)->list);
+ }
+ order++;
+ }
+
+ SH_SET_PFN_ORDER(pg, order);
+ list_add_tail(&pg->list, &d->arch.shadow.freelists[order]);
+}
+
+/* Divert some memory from the pool to be used by the p2m mapping.
+ * This action is irreversible: the p2m mapping only ever grows.
+ * That's OK because the p2m table only exists for external domains,
+ * and those domains can't ever turn off shadow mode.
+ * Also, we only ever allocate a max-order chunk, so as to preserve
+ * the invariant that shadow_prealloc() always works.
+ * Returns 0 iff it can't get a chunk (the caller should then
+ * free up some pages in domheap and call set_sh_allocation);
+ * returns non-zero on success.
+ */
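+/* Pages taken here are accounted in p2m_pages rather than total_pages,
+ * so they no longer count against the space available to shadow_alloc(). */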
+static int
+shadow_alloc_p2m_pages(struct domain *d)
+{
+ struct page_info *pg;
+ u32 i;
+ ASSERT(shadow_lock_is_acquired(d));
+
+ if ( d->arch.shadow.total_pages
+ < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
+ return 0; /* Not enough shadow memory: need to increase it first */
+
+ pg = mfn_to_page(shadow_alloc(d, PGC_SH_p2m_table, 0));
+ d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
+ d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
+ for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
+ {
+ /* Unlike shadow pages, mark p2m pages as owned by the domain */
+ page_set_owner(&pg[i], d);
+ list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
+ }
+ return 1;
+}
+
+// Returns 0 if no memory is available...
+mfn_t
+shadow_alloc_p2m_page(struct domain *d)
+{
+ struct list_head *entry;
+ mfn_t mfn;
+ void *p;
+
+ if ( list_empty(&d->arch.shadow.p2m_freelist) &&
+ !shadow_alloc_p2m_pages(d) )
+ return _mfn(0);
+ entry = d->arch.shadow.p2m_freelist.next;
+ list_del(entry);
+ list_add_tail(entry, &d->arch.shadow.p2m_inuse);
+ mfn = page_to_mfn(list_entry(entry, struct page_info, list));
+ sh_get_ref(mfn, 0);
+ p = sh_map_domain_page(mfn);
+ clear_page(p);
+ sh_unmap_domain_page(p);
+
+ return mfn;
+}
+
+#if CONFIG_PAGING_LEVELS == 3
+static void p2m_install_entry_in_monitors(struct domain *d,
+ l3_pgentry_t *l3e)
+/* Special case, only used for external-mode domains on PAE hosts:
+ * update the mapping of the p2m table. Once again, this is trivial in
+ * other paging modes (one top-level entry points to the top-level p2m,
+ * no maintenance needed), but PAE makes life difficult by needing to
+ * copy the eight l3es of the p2m table into eight l2h slots in the
+ * monitor table. This function makes fresh copies when a p2m l3e
+ * changes. */
+{
+ l2_pgentry_t *ml2e;
+ struct vcpu *v;
+ unsigned int index;
+
+ index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
+ ASSERT(index < MACHPHYS_MBYTES>>1);
+
+ for_each_vcpu(d, v)
+ {
+ if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ continue;
+ ASSERT(shadow_mode_external(v->domain));
+
+ SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
+ d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
+
+ if ( v == current ) /* OK to use linear map of monitor_table */
+ ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
+ else
+ {
+ l3_pgentry_t *ml3e;
+            ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
+ ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
+ ml2e += l2_table_offset(RO_MPT_VIRT_START);
+ sh_unmap_domain_page(ml3e);
+ }
+ ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
+ if ( v != current )
+ sh_unmap_domain_page(ml2e);
+ }
+}
+#endif
+
+// Find the next level's P2M entry, checking for out-of-range gfn's...
+// Returns NULL on error.
+//
+static l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+ unsigned long gfn, u32 shift, u32 max)
+{
+ u32 index;
+
+ index = *gfn_remainder >> shift;
+ if ( index >= max )
+ {
+ SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
+ "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+ gfn, *gfn_remainder, shift, index, max);
+ return NULL;
+ }
+ *gfn_remainder &= (1 << shift) - 1;
+ return (l1_pgentry_t *)table + index;
+}
+
+// Walk one level of the P2M table, allocating a new table if required.
+// Returns 0 on error.
+//
+static int
+p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
+ unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
+ u32 max, unsigned long type)
+{
+ l1_pgentry_t *p2m_entry;
+ void *next;
+
+ if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
+ shift, max)) )
+ return 0;
+
+ if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+ {
+ mfn_t mfn = shadow_alloc_p2m_page(d);
+ if ( mfn_x(mfn) == 0 )
+ return 0;
+ *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+ mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
+ mfn_to_page(mfn)->count_info = 1;
+#if CONFIG_PAGING_LEVELS == 3
+ if (type == PGT_l2_page_table)
+ {
+ /* We have written to the p2m l3: need to sync the per-vcpu
+ * copies of it in the monitor tables */
+ p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
+ }
+#endif
+ /* The P2M can be shadowed: keep the shadows synced */
+ if ( d->vcpu[0] )
+ (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn,
+ p2m_entry, sizeof *p2m_entry);
+ }
+ *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
+ next = sh_map_domain_page(*table_mfn);
+ sh_unmap_domain_page(*table);
+ *table = next;
+
+ return 1;
+}
+
+// Returns 0 on error (out of memory)
+int
+shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+ // XXX -- this might be able to be faster iff current->domain == d
+ mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
+ void *table = sh_map_domain_page(table_mfn);
+ unsigned long gfn_remainder = gfn;
+ l1_pgentry_t *p2m_entry;
+
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+ return 0;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+ // When using PAE Xen, we only allow 33 bits of pseudo-physical
+ // address in translated guests (i.e. 8 GBytes). This restriction
+ // comes from wanting to map the P2M table into the 16MB RO_MPT hole
+ // in Xen's address space for translated PV guests.
+ //
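+    // (Arithmetic: 16MB of mapping / 8 bytes per l1 entry = 2^21 entries,
+    // each naming one 4kB frame, i.e. 2^33 bytes = 8GB of guest space.)
+    //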
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+ (CONFIG_PAGING_LEVELS == 3
+ ? 8
+ : L3_PAGETABLE_ENTRIES),
+ PGT_l2_page_table) )
+ return 0;
+#endif
+ if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+ return 0;
+
+ p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+ 0, L1_PAGETABLE_ENTRIES);
+ ASSERT(p2m_entry);
+ if ( valid_mfn(mfn) )
+ *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+ else
+ *p2m_entry = l1e_empty();
+
+ /* The P2M can be shadowed: keep the shadows synced */
+ (void) __shadow_validate_guest_entry(d->vcpu[0], table_mfn,
+ p2m_entry, sizeof *p2m_entry);
+
+ sh_unmap_domain_page(table);
+
+ return 1;
+}
+
+// Allocate a new p2m table for a domain.
+//
+// The structure of the p2m table is that of a pagetable for xen (i.e. it is
+// controlled by CONFIG_PAGING_LEVELS).
+//
+// Returns 0 if p2m table could not be initialized
+//
+static int
+shadow_alloc_p2m_table(struct domain *d)
+{
+ mfn_t p2m_top;
+ struct list_head *entry;
+ unsigned int page_count = 0;
+
+ SHADOW_PRINTK("allocating p2m table\n");
+ ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
+
+    p2m_top = shadow_alloc_p2m_page(d);
+    if ( mfn_x(p2m_top) == 0 )
+        return 0;
+
+    mfn_to_page(p2m_top)->count_info = 1;
+    mfn_to_page(p2m_top)->u.inuse.type_info =
+#if CONFIG_PAGING_LEVELS == 4
+        PGT_l4_page_table
+#elif CONFIG_PAGING_LEVELS == 3
+        PGT_l3_page_table
+#elif CONFIG_PAGING_LEVELS == 2
+        PGT_l2_page_table
+#endif
+            | 1 | PGT_validated;
+
+ d->arch.phys_table = pagetable_from_mfn(p2m_top);
+
+ SHADOW_PRINTK("populating p2m table\n");
+
+ for ( entry = d->page_list.next;
+ entry != &d->page_list;
+ entry = entry->next )
+ {
+ struct page_info *page = list_entry(entry, struct page_info, list);
+ mfn_t mfn = page_to_mfn(page);
+ unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
+ page_count++;
+ if (
+#ifdef __x86_64__
+ (gfn != 0x5555555555555555L)
+#else
+ (gfn != 0x55555555L)
+#endif
+ && gfn != INVALID_M2P_ENTRY
+ && !shadow_set_p2m_entry(d, gfn, mfn) )
+ {
+            SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH_PRI_mfn "\n",
+                          gfn, mfn_x(mfn));
+ return 0;
+ }
+ }
+
+ SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
+ return 1;
+}
+
+mfn_t
+sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+/* Read another domain's p2m entries */
+{
+ mfn_t mfn;
+ unsigned long addr = gpfn << PAGE_SHIFT;
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+
+ ASSERT(shadow_mode_translate(d));
+ mfn = pagetable_get_mfn(d->arch.phys_table);
+
+
+#if CONFIG_PAGING_LEVELS > 2
+ if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) )
+ /* This pfn is higher than the p2m map can hold */
+ return _mfn(INVALID_MFN);
+#endif
+
+
+#if CONFIG_PAGING_LEVELS >= 4
+ {
+ l4_pgentry_t *l4e = sh_map_domain_page(mfn);
+ l4e += l4_table_offset(addr);
+ if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+ {
+ sh_unmap_domain_page(l4e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l4e_get_pfn(*l4e));
+ sh_unmap_domain_page(l4e);
+ }
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+ {
+ l3_pgentry_t *l3e = sh_map_domain_page(mfn);
+ l3e += l3_table_offset(addr);
+ if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+ {
+ sh_unmap_domain_page(l3e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l3e_get_pfn(*l3e));
+ sh_unmap_domain_page(l3e);
+ }
+#endif
+
+ l2e = sh_map_domain_page(mfn);
+ l2e += l2_table_offset(addr);
+ if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+ {
+ sh_unmap_domain_page(l2e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l2e_get_pfn(*l2e));
+ sh_unmap_domain_page(l2e);
+
+ l1e = sh_map_domain_page(mfn);
+ l1e += l1_table_offset(addr);
+ if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+ {
+ sh_unmap_domain_page(l1e);
+ return _mfn(INVALID_MFN);
+ }
+ mfn = _mfn(l1e_get_pfn(*l1e));
+ sh_unmap_domain_page(l1e);
+
+ return mfn;
+}
+
+unsigned long
+shadow_gfn_to_mfn_foreign(unsigned long gpfn)
+{
+ return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
+}
+
+
+static void shadow_p2m_teardown(struct domain *d)
+/* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+{
+ struct list_head *entry, *n;
+ struct page_info *pg;
+
+ d->arch.phys_table = pagetable_null();
+
+ list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
+ {
+ pg = list_entry(entry, struct page_info, list);
+ list_del(entry);
+ /* Should have just the one ref we gave it in alloc_p2m_page() */
+ if ( (pg->count_info & PGC_SH_count_mask) != 1 )
+ {
+ SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
+ pg->count_info, pg->u.inuse.type_info);
+ }
+ ASSERT(page_get_owner(pg) == d);
+ /* Free should not decrement domain's total allocation, since
+ * these pages were allocated without an owner. */
+ page_set_owner(pg, NULL);
+ free_domheap_pages(pg, 0);
+ d->arch.shadow.p2m_pages--;
+ perfc_decr(shadow_alloc_count);
+ }
+ list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
+ {
+ list_del(entry);
+ pg = list_entry(entry, struct page_info, list);
+ ASSERT(page_get_owner(pg) == d);
+ /* Free should not decrement domain's total allocation. */
+ page_set_owner(pg, NULL);
+ free_domheap_pages(pg, 0);
+ d->arch.shadow.p2m_pages--;
+ perfc_decr(shadow_alloc_count);
+ }
+ ASSERT(d->arch.shadow.p2m_pages == 0);
+}
+
+/* Set the pool of shadow pages to the required number of pages.
+ * Input will be rounded up to at least shadow_min_acceptable_pages(),
+ * plus space for the p2m table.
+ * Returns 0 for success, non-zero for failure. */
+static unsigned int set_sh_allocation(struct domain *d,
+ unsigned int pages,
+ int *preempted)
+{
+ struct page_info *pg;
+ unsigned int lower_bound;
+ int j;
+
+ ASSERT(shadow_lock_is_acquired(d));
+
+ /* Don't allocate less than the minimum acceptable, plus one page per
+ * megabyte of RAM (for the p2m table) */
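+    /* (e.g. a domain with 512MB of RAM, i.e. 131072 4kB pages, gets at
+     * least shadow_min_acceptable_pages(d) + 131072/256 = 512 extra
+     * pages here for its p2m.) */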
+ lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
+ if ( pages > 0 && pages < lower_bound )
+ pages = lower_bound;
+ /* Round up to largest block size */
+ pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
+
+ SHADOW_PRINTK("current %i target %i\n",
+ d->arch.shadow.total_pages, pages);
+
+ while ( d->arch.shadow.total_pages != pages )
+ {
+ if ( d->arch.shadow.total_pages < pages )
+ {
+ /* Need to allocate more memory from domheap */
+ pg = alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0);
+ if ( pg == NULL )
+ {
+ SHADOW_PRINTK("failed to allocate shadow pages.\n");
+ return -ENOMEM;
+ }
+ d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
+ d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
+ for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ )
+ {
+ pg[j].u.inuse.type_info = 0; /* Free page */
+ pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
+ }
+ SH_SET_PFN_ORDER(pg, SHADOW_MAX_ORDER);
+ list_add_tail(&pg->list,
+ &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
+ }
+ else if ( d->arch.shadow.total_pages > pages )
+ {
+ /* Need to return memory to domheap */
+ shadow_prealloc(d, SHADOW_MAX_ORDER);
+ ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
+ pg = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next,
+ struct page_info, list);
+ list_del(&pg->list);
+ d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
+ d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
+ free_domheap_pages(pg, SHADOW_MAX_ORDER);
+ }
+
+ /* Check to see if we need to yield and try again */
+ if ( preempted && hypercall_preempt_check() )
+ {
+ *preempted = 1;
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+unsigned int shadow_set_allocation(struct domain *d,
+ unsigned int megabytes,
+ int *preempted)
+/* Hypercall interface to set the shadow memory allocation */
+{
+ unsigned int rv;
+ shadow_lock(d);
+ rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted);
+ SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n",
+ d->domain_id,
+ d->arch.shadow.total_pages,
+ shadow_get_allocation(d));
+ shadow_unlock(d);
+ return rv;
+}
+
+/**************************************************************************/
+/* Hash table for storing the guest->shadow mappings */
+
+/* Hash function that takes a gfn or mfn, plus another byte of type info */
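+/* The per-byte step below, k = c + (k<<6) + (k<<16) - k, is the classic
+ * sdbm string hash, applied to the bytes of n and seeded with the type t. */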
+typedef u32 key_t;
+static inline key_t sh_hash(unsigned long n, u8 t)
+{
+ unsigned char *p = (unsigned char *)&n;
+ key_t k = t;
+ int i;
+ for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
+ return k;
+}
+
+#if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
+
+/* Before we get to the mechanism, define a pair of audit functions
+ * that sanity-check the contents of the hash table. */
+static void sh_hash_audit_bucket(struct domain *d, int bucket)
+/* Audit one bucket of the hash table */
+{
+ struct shadow_hash_entry *e, *x;
+ struct page_info *pg;
+
+ if ( !(SHADOW_AUDIT_ENABLE) )
+ return;
+
+ e = &d->arch.shadow.hash_table[bucket];
+ if ( e->t == 0 ) return; /* Bucket is empty */
+ while ( e )
+ {
+ /* Empty link? */
+ BUG_ON( e->t == 0 );
+ /* Bogus type? */
+ BUG_ON( e->t > (PGC_SH_max_shadow >> PGC_SH_type_shift) );
+ /* Wrong bucket? */
+ BUG_ON( sh_hash(e->n, e->t) % SHADOW_HASH_BUCKETS != bucket );
+ /* Duplicate entry? */
+ for ( x = e->next; x; x = x->next )
+ BUG_ON( x->n == e->n && x->t == e->t );
+ /* Bogus MFN? */
+ BUG_ON( !valid_mfn(e->smfn) );
+ pg = mfn_to_page(e->smfn);
+ /* Not a shadow? */
+ BUG_ON( page_get_owner(pg) != 0 );
+ /* Wrong kind of shadow? */
+ BUG_ON( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift
+ != e->t );
+ /* Bad backlink? */
+ BUG_ON( pg->u.inuse.type_info != e->n );
+ if ( e->t != (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
+ && e->t != (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
+ && e->t != (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) )
+ {
+ /* Bad shadow flags on guest page? */
+ BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow_flags & (1<<e->t)) );
+ }
+ /* That entry was OK; on we go */
+ e = e->next;
+ }
+}
+
+#else
+#define sh_hash_audit_bucket(_d, _b)
+#endif /* Hashtable bucket audit */
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
+
+static void sh_hash_audit(struct domain *d)
+/* Full audit: audit every bucket in the table */
+{
+ int i;
+
+ if ( !(SHADOW_AUDIT_ENABLE) )
+ return;
+
+ for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
+ {
+ sh_hash_audit_bucket(d, i);
+ }
+}
+
+#else
+#define sh_hash_audit(_d)
+#endif /* Hashtable full audit */
+
+/* Memory management interface for bucket allocation.
+ * These ought to come out of shadow memory, but at least on 32-bit
+ * machines we are forced to allocate them from xenheap so that we can
+ * address them. */
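+/* Each allocation made below is SHADOW_HASH_BUCKETS entries followed by
+ * one pointer; that pointer chains the allocation blocks together so
+ * that shadow_hash_teardown() can free them all again. */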
+static struct shadow_hash_entry *sh_alloc_hash_entry(struct domain *d)
+{
+ struct shadow_hash_entry *extra, *x;
+ int i;
+
+ /* We need to allocate a new node. Ensure the free list is not empty.
+ * Allocate new entries in units the same size as the original table. */
+ if ( unlikely(d->arch.shadow.hash_freelist == NULL) )
+ {
+ size_t sz = sizeof(void *) + (SHADOW_HASH_BUCKETS * sizeof(*x));
+ extra = xmalloc_bytes(sz);
+
+ if ( extra == NULL )
+ {
+ /* No memory left! */
+ SHADOW_ERROR("xmalloc() failed when allocating hash buckets.\n");
+ domain_crash_synchronous();
+ }
+ memset(extra, 0, sz);
+
+ /* Record the allocation block so it can be correctly freed later. */
+ *((struct shadow_hash_entry **)&extra[SHADOW_HASH_BUCKETS]) =
+ d->arch.shadow.hash_allocations;
+ d->arch.shadow.hash_allocations = &extra[0];
+
+ /* Thread a free chain through the newly-allocated nodes. */
+ for ( i = 0; i < (SHADOW_HASH_BUCKETS - 1); i++ )
+ extra[i].next = &extra[i+1];
+ extra[i].next = NULL;
+
+ /* Add the new nodes to the free list. */
+ d->arch.shadow.hash_freelist = &extra[0];
+ }
+
+ /* Allocate a new node from the free list. */
+ x = d->arch.shadow.hash_freelist;
+ d->arch.shadow.hash_freelist = x->next;
+ return x;
+}
+
+static void sh_free_hash_entry(struct domain *d, struct shadow_hash_entry *e)
+{
+ /* Mark the bucket as empty and return it to the free list */
+ e->t = 0;
+ e->next = d->arch.shadow.hash_freelist;
+ d->arch.shadow.hash_freelist = e;
+}
+
+
+/* Allocate and initialise the table itself.
+ * Returns 0 for success, 1 for error. */
+static int shadow_hash_alloc(struct domain *d)
+{
+ struct shadow_hash_entry *table;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ ASSERT(!d->arch.shadow.hash_table);
+
+ table = xmalloc_array(struct shadow_hash_entry, SHADOW_HASH_BUCKETS);
+ if ( !table ) return 1;
+ memset(table, 0,
+ SHADOW_HASH_BUCKETS * sizeof (struct shadow_hash_entry));
+ d->arch.shadow.hash_table = table;
+ return 0;
+}
+
+/* Tear down the hash table and return all memory to Xen.
+ * This function does not care whether the table is populated. */
+static void shadow_hash_teardown(struct domain *d)
+{
+ struct shadow_hash_entry *a, *n;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ ASSERT(d->arch.shadow.hash_table);
+
+ /* Return the table itself */
+ xfree(d->arch.shadow.hash_table);
+ d->arch.shadow.hash_table = NULL;
+
+ /* Return any extra allocations */
+ a = d->arch.shadow.hash_allocations;
+ while ( a )
+ {
+ /* We stored a linked-list pointer at the end of each allocation */
+ n = *((struct shadow_hash_entry **)(&a[SHADOW_HASH_BUCKETS]));
+ xfree(a);
+ a = n;
+ }
+ d->arch.shadow.hash_allocations = NULL;
+ d->arch.shadow.hash_freelist = NULL;
+}
+
+
+mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
+/* Find an entry in the hash table. Returns the MFN of the shadow,
+ * or INVALID_MFN if it doesn't exist */
+{
+ struct domain *d = v->domain;
+ struct shadow_hash_entry *p, *x, *head;
+ key_t key;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ ASSERT(d->arch.shadow.hash_table);
+ ASSERT(t);
+
+ sh_hash_audit(d);
+
+ perfc_incrc(shadow_hash_lookups);
+ key = sh_hash(n, t);
+
+ x = head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
+ p = NULL;
+
+ sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
+
+ do
+ {
+ ASSERT(x->t || ((x == head) && (x->next == NULL)));
+
+ if ( x->n == n && x->t == t )
+ {
+ /* Pull-to-front if 'x' isn't already the head item */
+ if ( unlikely(x != head) )
+ {
+ if ( unlikely(d->arch.shadow.hash_walking != 0) )
+ /* Can't reorder: someone is walking the hash chains */
+ return x->smfn;
+ else
+ {
+ /* Delete 'x' from list and reinsert after head. */
+ p->next = x->next;
+ x->next = head->next;
+ head->next = x;
+
+ /* Swap 'x' contents with head contents. */
+ SWAP(head->n, x->n);
+ SWAP(head->t, x->t);
+ SWAP(head->smfn, x->smfn);
+ }
+ }
+ else
+ {
+ perfc_incrc(shadow_hash_lookup_head);
+ }
+ return head->smfn;
+ }
+
+ p = x;
+ x = x->next;
+ }
+ while ( x != NULL );
+
+ perfc_incrc(shadow_hash_lookup_miss);
+ return _mfn(INVALID_MFN);
+}
+
+void shadow_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Put a mapping (n,t)->smfn into the hash table */
+{
+ struct domain *d = v->domain;
+ struct shadow_hash_entry *x, *head;
+ key_t key;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ ASSERT(d->arch.shadow.hash_table);
+ ASSERT(t);
+
+ sh_hash_audit(d);
+
+ perfc_incrc(shadow_hash_inserts);
+ key = sh_hash(n, t);
+
+ head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
+
+ sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
+
+ /* If the bucket is empty then insert the new page as the head item. */
+ if ( head->t == 0 )
+ {
+ head->n = n;
+ head->t = t;
+ head->smfn = smfn;
+ ASSERT(head->next == NULL);
+ }
+ else
+ {
+ /* Insert a new entry directly after the head item. */
+ x = sh_alloc_hash_entry(d);
+ x->n = n;
+ x->t = t;
+ x->smfn = smfn;
+ x->next = head->next;
+ head->next = x;
+ }
+
+ sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
+}
+
+void shadow_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Excise the mapping (n,t)->smfn from the hash table */
+{
+ struct domain *d = v->domain;
+ struct shadow_hash_entry *p, *x, *head;
+ key_t key;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ ASSERT(d->arch.shadow.hash_table);
+ ASSERT(t);
+
+ sh_hash_audit(d);
+
+ perfc_incrc(shadow_hash_deletes);
+ key = sh_hash(n, t);
+
+ head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
+
+ sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
+
+ /* Match on head item? */
+ if ( head->n == n && head->t == t )
+ {
+ if ( (x = head->next) != NULL )
+ {
+ /* Overwrite head with contents of following node. */
+ head->n = x->n;
+ head->t = x->t;
+ head->smfn = x->smfn;
+
+ /* Delete following node. */
+ head->next = x->next;
+ sh_free_hash_entry(d, x);
+ }
+ else
+ {
+ /* This bucket is now empty. Initialise the head node. */
+ head->t = 0;
+ }
+ }
+ else
+ {
+ /* Not at the head; need to walk the chain */
+ p = head;
+ x = head->next;
+
+ while(1)
+ {
+ ASSERT(x); /* We can't have hit the end, since our target is
+                        * still in the chain somewhere... */
+ if ( x->n == n && x->t == t )
+ {
+ /* Delete matching node. */
+ p->next = x->next;
+ sh_free_hash_entry(d, x);
+ break;
+ }
+ p = x;
+ x = x->next;
+ }
+ }
+
+ sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
+}
+
+typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
+
+static void hash_foreach(struct vcpu *v,
+ unsigned int callback_mask,
+ hash_callback_t callbacks[],
+ mfn_t callback_mfn)
+/* Walk the hash table looking at the types of the entries and
+ * calling the appropriate callback function for each entry.
+ * The mask determines which shadow types we call back for, and the array
+ * of callbacks tells us which function to call.
+ * Any callback may return non-zero to let us skip the rest of the scan.
+ *
+ * WARNING: Callbacks MUST NOT add or remove hash entries unless they
+ * then return non-zero to terminate the scan. */
+{
+ int i, done = 0;
+ struct domain *d = v->domain;
+ struct shadow_hash_entry *x;
+
+ /* Say we're here, to stop hash-lookups reordering the chains */
+ ASSERT(shadow_lock_is_acquired(d));
+ ASSERT(d->arch.shadow.hash_walking == 0);
+ d->arch.shadow.hash_walking = 1;
+
+ callback_mask &= ~1; /* Never attempt to call back on empty buckets */
+ for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
+ {
+ /* WARNING: This is not safe against changes to the hash table.
+ * The callback *must* return non-zero if it has inserted or
+ * deleted anything from the hash (lookups are OK, though). */
+ for ( x = &d->arch.shadow.hash_table[i]; x; x = x->next )
+ {
+ if ( callback_mask & (1 << x->t) )
+ {
+ ASSERT(x->t <= 15);
+ ASSERT(callbacks[x->t] != NULL);
+ if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
+ break;
+ }
+ }
+ if ( done ) break;
+ }
+ d->arch.shadow.hash_walking = 0;
+}
+
+
+/**************************************************************************/
+/* Destroy a shadow page: simple dispatcher to call the per-type destructor
+ * which will decrement refcounts appropriately and return memory to the
+ * free pool. */
+
+void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ u32 t = pg->count_info & PGC_SH_type_mask;
+
+
+ SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
+
+ /* Double-check, if we can, that the shadowed page belongs to this
+ * domain, (by following the back-pointer). */
+ ASSERT(t == PGC_SH_fl1_32_shadow ||
+ t == PGC_SH_fl1_pae_shadow ||
+ t == PGC_SH_fl1_64_shadow ||
+ t == PGC_SH_monitor_table ||
+ (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info)))
+ == v->domain));
+
+ /* The down-shifts here are so that the switch statement is on nice
+ * small numbers that the compiler will enjoy */
+ switch ( t >> PGC_SH_type_shift )
+ {
+#if CONFIG_PAGING_LEVELS == 2
+ case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
+ case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn);
+ break;
+ case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
+ break;
+#else /* PAE or 64bit */
+ case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
+ case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
+ break;
+ case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
+ break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+ case PGC_SH_l1_pae_shadow >> PGC_SH_type_shift:
+ case PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
+ break;
+ case PGC_SH_l2_pae_shadow >> PGC_SH_type_shift:
+ case PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
+ break;
+ case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 3, 3)(v, smfn);
+ break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+ case PGC_SH_l1_64_shadow >> PGC_SH_type_shift:
+ case PGC_SH_fl1_64_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
+ break;
+ case PGC_SH_l2_64_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
+ break;
+ case PGC_SH_l3_64_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
+ break;
+ case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
+ SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
+ break;
+#endif
+ default:
+ SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n",
+ (unsigned long)t);
+ BUG();
+ }
+}
+
+/**************************************************************************/
+/* Remove all writeable mappings of a guest frame from the shadow tables
+ * Returns non-zero if we need to flush TLBs.
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access.*/
+
+int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn,
+ unsigned int level,
+ unsigned long fault_addr)
+{
+ /* Dispatch table for getting per-type functions */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32 */
+ SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32 */
+#else
+ SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32 */
+ SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32 */
+#endif
+ NULL, /* l2_32 */
+#if CONFIG_PAGING_LEVELS >= 3
+ SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae */
+ SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */
+#else
+ NULL, /* l1_pae */
+ NULL, /* fl1_pae */
+#endif
+ NULL, /* l2_pae */
+ NULL, /* l2h_pae */
+ NULL, /* l3_pae */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64 */
+ SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64 */
+#else
+ NULL, /* l1_64 */
+ NULL, /* fl1_64 */
+#endif
+ NULL, /* l2_64 */
+ NULL, /* l3_64 */
+ NULL, /* l4_64 */
+ NULL, /* p2m */
+ NULL /* unused */
+ };
+
+ static unsigned int callback_mask =
+ 1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
+ | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
+ | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
+ | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
+ | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
+ | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
+ ;
+ struct page_info *pg = mfn_to_page(gmfn);
+
+ ASSERT(shadow_lock_is_acquired(v->domain));
+
+ /* Only remove writable mappings if we are doing shadow refcounts.
+ * In guest refcounting, we trust Xen to already be restricting
+ * all the writes to the guest page tables, so we do not need to
+ * do more. */
+ if ( !shadow_mode_refcounts(v->domain) )
+ return 0;
+
+ /* Early exit if it's already a pagetable, or otherwise not writeable */
+ if ( sh_mfn_is_a_page_table(gmfn)
+ || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
+ return 0;
+
+ perfc_incrc(shadow_writeable);
+
+ /* If this isn't a "normal" writeable page, the domain is trying to
+ * put pagetables in special memory of some kind. We can't allow that. */
+ if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
+ {
+ SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
+ PRtype_info "\n",
+ mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
+ domain_crash(v->domain);
+ }
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+ if ( v == current && level != 0 )
+ {
+ unsigned long gfn;
+ /* Heuristic: there is likely to be only one writeable mapping,
+ * and that mapping is likely to be in the current pagetable,
+ * either in the guest's linear map (linux, windows) or in a
+         * magic slot used to map high memory regions (linux HIGHPTE) */
+
+#define GUESS(_a, _h) do { \
+ if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \
+ perfc_incrc(shadow_writeable_h_ ## _h); \
+ if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
+ return 1; \
+ } while (0)
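+/* Each GUESS() probes one guessed virtual address via the current mode's
+ * guess_wrmap() hook, and returns early if the guest frame's writable
+ * refcount has dropped to zero, avoiding the brute-force hash walk below. */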
+
+
+ /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
+ if ( v == current
+ && (gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
+ GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
+
+ if ( v->arch.shadow.mode->guest_levels == 2 )
+ {
+ if ( level == 1 )
+ /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
+ GUESS(0xC0000000UL + (fault_addr >> 10), 1);
+ }
+#if CONFIG_PAGING_LEVELS >= 3
+ else if ( v->arch.shadow.mode->guest_levels == 3 )
+ {
+ /* 32bit PAE w2k3: linear map at 0xC0000000 */
+ switch ( level )
+ {
+ case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
+ case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
+ }
+ }
+#if CONFIG_PAGING_LEVELS >= 4
+ else if ( v->arch.shadow.mode->guest_levels == 4 )
+ {
+ /* 64bit w2k3: linear map at 0x0000070000000000 */
+ switch ( level )
+ {
+ case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
+ case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
+ case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
+ }
+ }
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS >= 3 */
+
+#undef GUESS
+
+ }
+#endif
+
+ /* Brute-force search of all the shadows, by walking the hash */
+ perfc_incrc(shadow_writeable_bf);
+ hash_foreach(v, callback_mask, callbacks, gmfn);
+
+ /* If that didn't catch the mapping, something is very wrong */
+ if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
+ {
+ SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
+ "%lu left\n", mfn_x(gmfn),
+ (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
+ domain_crash(v->domain);
+ }
+
+ /* We killed at least one writeable mapping, so must flush TLBs. */
+ return 1;
+}
+
+
+
+/**************************************************************************/
+/* Remove all mappings of a guest frame from the shadow tables.
+ * Returns non-zero if we need to flush TLBs. */
+
+int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
+{
+ struct page_info *page = mfn_to_page(gmfn);
+ int expected_count;
+
+ /* Dispatch table for getting per-type functions */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32 */
+ SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32 */
+#else
+ SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32 */
+ SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32 */
+#endif
+ NULL, /* l2_32 */
+#if CONFIG_PAGING_LEVELS >= 3
+ SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae */
+ SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */
+#else
+ NULL, /* l1_pae */
+ NULL, /* fl1_pae */
+#endif
+ NULL, /* l2_pae */
+ NULL, /* l2h_pae */
+ NULL, /* l3_pae */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64 */
+ SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64 */
+#else
+ NULL, /* l1_64 */
+ NULL, /* fl1_64 */
+#endif
+ NULL, /* l2_64 */
+ NULL, /* l3_64 */
+ NULL, /* l4_64 */
+ NULL, /* p2m */
+ NULL /* unused */
+ };
+
+ static unsigned int callback_mask =
+ 1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
+ | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
+ | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
+ | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
+ | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
+ | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
+ ;
+
+ perfc_incrc(shadow_mappings);
+ if ( (page->count_info & PGC_count_mask) == 0 )
+ return 0;
+
+ ASSERT(shadow_lock_is_acquired(v->domain));
+
+ /* XXX TODO:
+ * Heuristics for finding the (probably) single mapping of this gmfn */
+
+ /* Brute-force search of all the shadows, by walking the hash */
+ perfc_incrc(shadow_mappings_bf);
+ hash_foreach(v, callback_mask, callbacks, gmfn);
+
+ /* If that didn't catch the mapping, something is very wrong */
+ expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
+ if ( (page->count_info & PGC_count_mask) != expected_count )
+ {
+ /* Don't complain if we're in HVM and there's one extra mapping:
+ * The qemu helper process has an untyped mapping of this dom's RAM */
+ if ( !(shadow_mode_external(v->domain)
+ && (page->count_info & PGC_count_mask) <= 2
+ && (page->u.inuse.type_info & PGT_count_mask) == 0) )
+ {
+ SHADOW_ERROR("can't find all mappings of mfn %lx: "
+ "c=%08x t=%08lx\n", mfn_x(gmfn),
+ page->count_info, page->u.inuse.type_info);
+ }
+ }
+
+ /* We killed at least one mapping, so must flush TLBs. */
+ return 1;
+}
+
+
+/**************************************************************************/
+/* Remove all shadows of a guest frame from the shadow tables */
+
+static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
+/* Follow this shadow's up-pointer, if it has one, and remove the reference
+ * found there. Returns 1 if that was the only reference to this shadow */
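+/* The up-pointer (pg->up) packs the machine address of the one shadow
+ * entry known to reference this shadow: the containing shadow's mfn in
+ * the upper bits and the entry's byte offset in the low 12 bits. */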
+{
+ struct page_info *pg = mfn_to_page(smfn);
+ mfn_t pmfn;
+ void *vaddr;
+ int rc;
+
+ ASSERT((pg->count_info & PGC_SH_type_mask) > 0);
+ ASSERT((pg->count_info & PGC_SH_type_mask) < PGC_SH_max_shadow);
+ ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l2_32_shadow);
+ ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l3_pae_shadow);
+ ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l4_64_shadow);
+
+ if (pg->up == 0) return 0;
+ pmfn = _mfn(pg->up >> PAGE_SHIFT);
+ ASSERT(valid_mfn(pmfn));
+ vaddr = sh_map_domain_page(pmfn);
+ ASSERT(vaddr);
+ vaddr += pg->up & (PAGE_SIZE-1);
+ ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
+
+ /* Is this the only reference to this shadow? */
+ rc = ((pg->count_info & PGC_SH_count_mask) == 1) ? 1 : 0;
+
+ /* Blank the offending entry */
+ switch ((pg->count_info & PGC_SH_type_mask))
+ {
+ case PGC_SH_l1_32_shadow:
+ case PGC_SH_l2_32_shadow:
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
+#else
+ SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
+#endif
+ break;
+#if CONFIG_PAGING_LEVELS >=3
+ case PGC_SH_l1_pae_shadow:
+ case PGC_SH_l2_pae_shadow:
+ case PGC_SH_l2h_pae_shadow:
+ case PGC_SH_l3_pae_shadow:
+ SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
+ break;
+#if CONFIG_PAGING_LEVELS >= 4
+ case PGC_SH_l1_64_shadow:
+ case PGC_SH_l2_64_shadow:
+ case PGC_SH_l3_64_shadow:
+ case PGC_SH_l4_64_shadow:
+ SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
+ break;
+#endif
+#endif
+    default: BUG(); /* Some weird unknown shadow type */
+ }
+
+ sh_unmap_domain_page(vaddr);
+ if ( rc )
+ perfc_incrc(shadow_up_pointer);
+ else
+ perfc_incrc(shadow_unshadow_bf);
+
+ return rc;
+}
+
+void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
+/* Remove the shadows of this guest page.
+ * If all != 0, find all shadows, if necessary by walking the tables.
+ * Otherwise, just try the (much faster) heuristics, which will remove
+ * at most one reference to each shadow of the page. */
+{
+ struct page_info *pg;
+ mfn_t smfn;
+ u32 sh_flags;
+ unsigned char t;
+
+ /* Dispatch table for getting per-type functions: each level must
+ * be called with the function to remove a lower-level shadow. */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+ NULL, /* l1_32 */
+ NULL, /* fl1_32 */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */
+#else
+ SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */
+#endif
+ NULL, /* l1_pae */
+ NULL, /* fl1_pae */
+#if CONFIG_PAGING_LEVELS >= 3
+ SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */
+ SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
+ SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,3,3), /* l3_pae */
+#else
+ NULL, /* l2_pae */
+ NULL, /* l2h_pae */
+ NULL, /* l3_pae */
+#endif
+ NULL, /* l1_64 */
+ NULL, /* fl1_64 */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */
+ SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */
+ SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */
+#else
+ NULL, /* l2_64 */
+ NULL, /* l3_64 */
+ NULL, /* l4_64 */
+#endif
+ NULL, /* p2m */
+ NULL /* unused */
+ };
+
+ /* Another lookup table, for choosing which mask to use */
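+    /* masks[t] is the set of higher-level shadow types whose entries can
+     * point at a type-t shadow, i.e. the shadows we must scan in order to
+     * find and blank all references to it. */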
+ static unsigned int masks[16] = {
+ 0, /* none */
+ 1 << (PGC_SH_l2_32_shadow >> PGC_SH_type_shift), /* l1_32 */
+ 0, /* fl1_32 */
+ 0, /* l2_32 */
+ ((1 << (PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift))
+ | (1 << (PGC_SH_l2_pae_shadow >> PGC_SH_type_shift))), /* l1_pae */
+ 0, /* fl1_pae */
+ 1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2_pae */
+ 1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2h_pae */
+ 0, /* l3_pae */
+ 1 << (PGC_SH_l2_64_shadow >> PGC_SH_type_shift), /* l1_64 */
+ 0, /* fl1_64 */
+ 1 << (PGC_SH_l3_64_shadow >> PGC_SH_type_shift), /* l2_64 */
+ 1 << (PGC_SH_l4_64_shadow >> PGC_SH_type_shift), /* l3_64 */
+ 0, /* l4_64 */
+ 0, /* p2m */
+ 0 /* unused */
+ };
+
+ ASSERT(shadow_lock_is_acquired(v->domain));
+
+ pg = mfn_to_page(gmfn);
+
+ /* Bale out now if the page is not shadowed */
+ if ( (pg->count_info & PGC_page_table) == 0 )
+ return;
+
+ SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+ v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+
+ /* Search for this shadow in all appropriate shadows */
+ perfc_incrc(shadow_unshadow);
+ sh_flags = pg->shadow_flags;
+
+ /* Lower-level shadows need to be excised from upper-level shadows.
+ * This call to hash_foreach() looks dangerous but is in fact OK: each
+ * call will remove at most one shadow, and terminate immediately when
+ * it does remove it, so we never walk the hash after doing a deletion. */
+#define DO_UNSHADOW(_type) do { \
+ t = (_type) >> PGC_SH_type_shift; \
+ smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
+ if ( !sh_remove_shadow_via_pointer(v, smfn) && all ) \
+ hash_foreach(v, masks[t], callbacks, smfn); \
+} while (0)
+
+ /* Top-level shadows need to be unpinned */
+#define DO_UNPIN(_type) do { \
+ t = (_type) >> PGC_SH_type_shift; \
+ smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
+ if ( mfn_to_page(smfn)->count_info & PGC_SH_pinned ) \
+ sh_unpin(v, smfn); \
+ if ( (_type) == PGC_SH_l3_pae_shadow ) \
+ SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn); \
+} while (0)
+
+ if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(PGC_SH_l1_32_shadow);
+ if ( sh_flags & SHF_L2_32 ) DO_UNPIN(PGC_SH_l2_32_shadow);
+#if CONFIG_PAGING_LEVELS >= 3
+ if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(PGC_SH_l1_pae_shadow);
+ if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(PGC_SH_l2_pae_shadow);
+ if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(PGC_SH_l2h_pae_shadow);
+ if ( sh_flags & SHF_L3_PAE ) DO_UNPIN(PGC_SH_l3_pae_shadow);
+#if CONFIG_PAGING_LEVELS >= 4
+ if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(PGC_SH_l1_64_shadow);
+ if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(PGC_SH_l2_64_shadow);
+ if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(PGC_SH_l3_64_shadow);
+ if ( sh_flags & SHF_L4_64 ) DO_UNPIN(PGC_SH_l4_64_shadow);
+#endif
+#endif
+
+#undef DO_UNSHADOW
+#undef DO_UNPIN
+
+
+#if CONFIG_PAGING_LEVELS > 2
+ /* We may have caused some PAE l3 entries to change: need to
+ * fix up the copies of them in various places */
+ if ( sh_flags & (SHF_L2_PAE|SHF_L2H_PAE) )
+ sh_pae_recopy(v->domain);
+#endif
+
+ /* If that didn't catch the shadows, something is wrong */
+ if ( all && (pg->count_info & PGC_page_table) )
+ {
+        SHADOW_ERROR("can't find all shadows of mfn %05lx (shadow_flags=%08x)\n",
+                     mfn_x(gmfn), pg->shadow_flags);
+ domain_crash(v->domain);
+ }
+}
+
+void
+shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
+/* Even harsher: this is an HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+{
+ shadow_remove_all_shadows(v, gmfn);
+ /* XXX TODO:
+ * Rework this hashtable walker to return a linked-list of all
+ * the shadows it modified, then do breadth-first recursion
+ * to find the way up to higher-level tables and unshadow them too.
+ *
+ * The current code (just tearing down each page's shadows as we
+ * detect that it is not a pagetable) is correct, but very slow.
+ * It means extra emulated writes and slows down removal of mappings. */
+}
+
+/**************************************************************************/
+
+void sh_update_paging_modes(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
+ mfn_t old_guest_table;
+
+ ASSERT(shadow_lock_is_acquired(d));
+
+ // Valid transitions handled by this function:
+ // - For PV guests:
+ // - after a shadow mode has been changed
+ // - For HVM guests:
+ // - after a shadow mode has been changed
+ // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
+ //
+
+ // Avoid determining the current shadow mode for uninitialized CPUs, as
+ // we can not yet determine whether it is an HVM or PV domain.
+ //
+ if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+ {
+ printk("%s: postponing determination of shadow mode\n", __func__);
+ return;
+ }
+
+ // First, tear down any old shadow tables held by this vcpu.
+ //
+ shadow_detach_old_tables(v);
+
+ if ( !hvm_guest(v) )
+ {
+ ///
+ /// PV guest
+ ///
+#if CONFIG_PAGING_LEVELS == 4
+ if ( pv_32bit_guest(v) )
+ v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3);
+ else
+ v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
+#elif CONFIG_PAGING_LEVELS == 3
+ v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
+#elif CONFIG_PAGING_LEVELS == 2
+ v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
+#else
+#error unexpected paging mode
+#endif
+ }
+ else
+ {
+ ///
+ /// HVM guest
+ ///
+ ASSERT(shadow_mode_translate(d));
+ ASSERT(shadow_mode_external(d));
+
+ v->arch.shadow.hvm_paging_enabled = !!hvm_paging_enabled(v);
+ if ( !v->arch.shadow.hvm_paging_enabled )
+ {
+
+ /* Set v->arch.guest_table to use the p2m map, and choose
+ * the appropriate shadow mode */
+ old_guest_table = pagetable_get_mfn(v->arch.guest_table);
+#if CONFIG_PAGING_LEVELS == 2
+ v->arch.guest_table =
+ pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+ v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
+#elif CONFIG_PAGING_LEVELS == 3
+ v->arch.guest_table =
+ pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+ v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
+#else /* CONFIG_PAGING_LEVELS == 4 */
+ {
+ l4_pgentry_t *l4e;
+ /* Use the start of the first l3 table as a PAE l3 */
+ ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+                l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+ ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+ v->arch.guest_table =
+ pagetable_from_pfn(l4e_get_pfn(l4e[0]));
+ sh_unmap_domain_page(l4e);
+ }
+ v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
+#endif
+ /* Fix up refcounts on guest_table */
+ get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
+ if ( mfn_x(old_guest_table) != 0 )
+ put_page(mfn_to_page(old_guest_table));
+ }
+ else
+ {
+#ifdef __x86_64__
+ if ( hvm_long_mode_enabled(v) )
+ {
+ // long mode guest...
+ v->arch.shadow.mode =
+ &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
+ }
+ else
+#endif
+ if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
+ {
+#if CONFIG_PAGING_LEVELS >= 3
+ // 32-bit PAE mode guest...
+ v->arch.shadow.mode =
+ &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
+#else
+ SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
+ domain_crash(d);
+ return;
+#endif
+ }
+ else
+ {
+ // 32-bit 2 level guest...
+#if CONFIG_PAGING_LEVELS >= 3
+ v->arch.shadow.mode =
+ &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
+#else
+ v->arch.shadow.mode =
+ &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
+#endif
+ }
+ }
+
+ if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ {
+ mfn_t mmfn = shadow_make_monitor_table(v);
+ v->arch.monitor_table = pagetable_from_mfn(mmfn);
+ v->arch.monitor_vtable = sh_map_domain_page(mmfn);
+ }
+
+ if ( v->arch.shadow.mode != old_mode )
+ {
+ SHADOW_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
+ "(was g=%u s=%u)\n",
+ d->domain_id, v->vcpu_id,
+ v->arch.shadow.mode->guest_levels,
+ v->arch.shadow.mode->shadow_levels,
+ old_mode ? old_mode->guest_levels : 0,
+ old_mode ? old_mode->shadow_levels : 0);
+ if ( old_mode &&
+ (v->arch.shadow.mode->shadow_levels !=
+ old_mode->shadow_levels) )
+ {
+ /* Need to make a new monitor table for the new mode */
+ mfn_t new_mfn, old_mfn;
+
+ if ( v != current )
+ {
+ SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
+ "this HVM vcpu's (d=%u v=%u) paging mode!\n",
+ current->domain->domain_id, current->vcpu_id,
+ v->domain->domain_id, v->vcpu_id);
+ domain_crash(v->domain);
+ return;
+ }
+
+ sh_unmap_domain_page(v->arch.monitor_vtable);
+ old_mfn = pagetable_get_mfn(v->arch.monitor_table);
+ v->arch.monitor_table = pagetable_null();
+ new_mfn = v->arch.shadow.mode->make_monitor_table(v);
+ v->arch.monitor_table = pagetable_from_mfn(new_mfn);
+ v->arch.monitor_vtable = sh_map_domain_page(new_mfn);
+ SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
+ mfn_x(new_mfn));
+
+ /* Don't be running on the old monitor table when we
+ * pull it down! Switch CR3, and warn the HVM code that
+ * its host cr3 has changed. */
+ make_cr3(v, mfn_x(new_mfn));
+ write_ptbase(v);
+ hvm_update_host_cr3(v);
+ old_mode->destroy_monitor_table(v, old_mfn);
+ }
+ }
+
+ // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
+        // These are HARD: think about the case where two CPUs have
+ // different values for CR4.PSE and CR4.PGE at the same time.
+ // This *does* happen, at least for CR4.PGE...
+ }
+
+ v->arch.shadow.mode->update_cr3(v);
+}
+
+/**************************************************************************/
+/* Turning on and off shadow features */
+
+static void sh_new_mode(struct domain *d, u32 new_mode)
+/* Inform all the vcpus that the shadow mode has been changed */
+{
+ struct vcpu *v;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ ASSERT(d != current->domain);
+ d->arch.shadow.mode = new_mode;
+ if ( new_mode & SHM2_translate )
+ shadow_audit_p2m(d);
+ for_each_vcpu(d, v)
+ sh_update_paging_modes(v);
+}
+
+static int shadow_enable(struct domain *d, u32 mode)
+/* Turn on "permanent" shadow features: external, translate, refcount.
+ * Can only be called once on a domain, and these features cannot be
+ * disabled.
+ * Returns 0 for success, -errno for failure. */
+{
+ unsigned int old_pages;
+ int rv = 0;
+
+ mode |= SHM2_enable;
+
+ domain_pause(d);
+ shadow_lock(d);
+
+ /* Sanity check the arguments */
+ if ( (d == current->domain) ||
+ shadow_mode_enabled(d) ||
+ ((mode & SHM2_external) && !(mode & SHM2_translate)) )
+ {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ // XXX -- eventually would like to require that all memory be allocated
+ // *after* shadow_enabled() is called... So here, we would test to make
+ // sure that d->page_list is empty.
+#if 0
+ spin_lock(&d->page_alloc_lock);
+ if ( !list_empty(&d->page_list) )
+ {
+ spin_unlock(&d->page_alloc_lock);
+ rv = -EINVAL;
+ goto out;
+ }
+ spin_unlock(&d->page_alloc_lock);
+#endif
+
+ /* Init the shadow memory allocation if the user hasn't done so */
+ old_pages = d->arch.shadow.total_pages;
+ if ( old_pages == 0 )
+ if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
+ {
+ set_sh_allocation(d, 0, NULL);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ /* Init the hash table */
+ if ( shadow_hash_alloc(d) != 0 )
+ {
+ set_sh_allocation(d, old_pages, NULL);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ /* Init the P2M table */
+ if ( mode & SHM2_translate )
+ if ( !shadow_alloc_p2m_table(d) )
+ {
+ shadow_hash_teardown(d);
+ set_sh_allocation(d, old_pages, NULL);
+ shadow_p2m_teardown(d);
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ /* Update the bits */
+ sh_new_mode(d, mode);
+ shadow_audit_p2m(d);
+ out:
+ shadow_unlock(d);
+ domain_unpause(d);
+    return rv;
+}
+
+void shadow_teardown(struct domain *d)
+/* Destroy the shadow pagetables of this domain and free its shadow memory.
+ * Should only be called for dying domains. */
+{
+ struct vcpu *v;
+ mfn_t mfn;
+
+ ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
+ ASSERT(d != current->domain);
+
+ if ( !shadow_lock_is_acquired(d) )
+ shadow_lock(d); /* Keep various asserts happy */
+
+ if ( shadow_mode_enabled(d) )
+ {
+ /* Release the shadow and monitor tables held by each vcpu */
+ for_each_vcpu(d, v)
+ {
+ shadow_detach_old_tables(v);
+ if ( shadow_mode_external(d) )
+ {
+ mfn = pagetable_get_mfn(v->arch.monitor_table);
+ if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
+ shadow_destroy_monitor_table(v, mfn);
+ v->arch.monitor_table = pagetable_null();
+ }
+ }
+ }
+
+ if ( d->arch.shadow.total_pages != 0 )
+ {
+ SHADOW_PRINTK("teardown of domain %u starts."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow.total_pages,
+ d->arch.shadow.free_pages,
+ d->arch.shadow.p2m_pages);
+ /* Destroy all the shadows and release memory to domheap */
+ set_sh_allocation(d, 0, NULL);
+ /* Release the hash table back to xenheap */
+ if (d->arch.shadow.hash_table)
+ shadow_hash_teardown(d);
+ /* Release the log-dirty bitmap of dirtied pages */
+ sh_free_log_dirty_bitmap(d);
+ /* Should not have any more memory held */
+ SHADOW_PRINTK("teardown done."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->arch.shadow.total_pages,
+ d->arch.shadow.free_pages,
+ d->arch.shadow.p2m_pages);
+ ASSERT(d->arch.shadow.total_pages == 0);
+ }
+
+ /* We leave the "permanent" shadow modes enabled, but clear the
+ * log-dirty mode bit. We don't want any more mark_dirty()
+ * calls now that we've torn down the bitmap */
+ d->arch.shadow.mode &= ~SHM2_log_dirty;
+
+ shadow_unlock(d);
+}
+
+void shadow_final_teardown(struct domain *d)
+/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
+{
+
+ SHADOW_PRINTK("dom %u final teardown starts."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow.total_pages,
+ d->arch.shadow.free_pages,
+ d->arch.shadow.p2m_pages);
+
+ /* Double-check that the domain didn't have any shadow memory.
+ * It is possible for a domain that never got domain_kill()ed
+ * to get here with its shadow allocation intact. */
+ if ( d->arch.shadow.total_pages != 0 )
+ shadow_teardown(d);
+
+ /* It is now safe to pull down the p2m map. */
+ if ( d->arch.shadow.p2m_pages != 0 )
+ shadow_p2m_teardown(d);
+
+ SHADOW_PRINTK("dom %u final teardown done."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow.total_pages,
+ d->arch.shadow.free_pages,
+ d->arch.shadow.p2m_pages);
+}
+
+static int shadow_one_bit_enable(struct domain *d, u32 mode)
+/* Turn on a single shadow mode feature */
+{
+ ASSERT(shadow_lock_is_acquired(d));
+
+ /* Sanity check the call */
+ if ( d == current->domain || (d->arch.shadow.mode & mode) )
+ {
+ return -EINVAL;
+ }
+
+ if ( d->arch.shadow.mode == 0 )
+ {
+ /* Init the shadow memory allocation and the hash table */
+ if ( set_sh_allocation(d, 1, NULL) != 0
+ || shadow_hash_alloc(d) != 0 )
+ {
+ set_sh_allocation(d, 0, NULL);
+ return -ENOMEM;
+ }
+ }
+
+ /* Update the bits */
+ sh_new_mode(d, d->arch.shadow.mode | mode);
+
+ return 0;
+}
+
+static int shadow_one_bit_disable(struct domain *d, u32 mode)
+/* Turn off a single shadow mode feature */
+{
+ struct vcpu *v;
+ ASSERT(shadow_lock_is_acquired(d));
+
+ /* Sanity check the call */
+ if ( d == current->domain || !(d->arch.shadow.mode & mode) )
+ {
+ return -EINVAL;
+ }
+
+ /* Update the bits */
+ sh_new_mode(d, d->arch.shadow.mode & ~mode);
+ if ( d->arch.shadow.mode == 0 )
+ {
+ /* Get this domain off shadows */
+ SHADOW_PRINTK("un-shadowing of domain %u starts."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow.total_pages,
+ d->arch.shadow.free_pages,
+ d->arch.shadow.p2m_pages);
+ for_each_vcpu(d, v)
+ {
+ shadow_detach_old_tables(v);
+#if CONFIG_PAGING_LEVELS == 4
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
+ else
+#endif
+ make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
+
+ }
+
+ /* Pull down the memory allocation */
+ if ( set_sh_allocation(d, 0, NULL) != 0 )
+ {
+ // XXX - How can this occur?
+ // Seems like a bug to return an error now that we've
+ // disabled the relevant shadow mode.
+ //
+ return -ENOMEM;
+ }
+ shadow_hash_teardown(d);
+ SHADOW_PRINTK("un-shadowing of domain %u done."
+ " Shadow pages total = %u, free = %u, p2m=%u\n",
+ d->domain_id,
+ d->arch.shadow.total_pages,
+ d->arch.shadow.free_pages,
+ d->arch.shadow.p2m_pages);
+ }
+
+ return 0;
+}
+
+/* Enable/disable ops for the "test" and "log-dirty" modes */
+int shadow_test_enable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow_lock(d);
+
+ if ( shadow_mode_enabled(d) )
+ {
+        SHADOW_ERROR("Don't support enabling test mode "
+ "on already shadowed doms\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = shadow_one_bit_enable(d, SHM2_enable);
+ out:
+ shadow_unlock(d);
+ domain_unpause(d);
+
+ return ret;
+}
+
+int shadow_test_disable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow_lock(d);
+ ret = shadow_one_bit_disable(d, SHM2_enable);
+ shadow_unlock(d);
+ domain_unpause(d);
+
+ return ret;
+}
+
+static int
+sh_alloc_log_dirty_bitmap(struct domain *d)
+{
+ ASSERT(d->arch.shadow.dirty_bitmap == NULL);
+ d->arch.shadow.dirty_bitmap_size =
+ (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
+ ~(BITS_PER_LONG - 1);
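+    /* (Added note, illustrative only: the rounding above means that e.g.
+     *  max_pfn = 100 on a 64-bit build gives a bitmap size of 128 bits,
+     *  i.e. an array of two unsigned longs.) */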
+ d->arch.shadow.dirty_bitmap =
+ xmalloc_array(unsigned long,
+ d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
+ if ( d->arch.shadow.dirty_bitmap == NULL )
+ {
+ d->arch.shadow.dirty_bitmap_size = 0;
+ return -ENOMEM;
+ }
+ memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
+
+ return 0;
+}
+
+static void
+sh_free_log_dirty_bitmap(struct domain *d)
+{
+ d->arch.shadow.dirty_bitmap_size = 0;
+ if ( d->arch.shadow.dirty_bitmap )
+ {
+ xfree(d->arch.shadow.dirty_bitmap);
+ d->arch.shadow.dirty_bitmap = NULL;
+ }
+}
+
+static int shadow_log_dirty_enable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow_lock(d);
+
+ if ( shadow_mode_log_dirty(d) )
+ {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ( shadow_mode_enabled(d) )
+ {
+        SHADOW_ERROR("Don't (yet) support enabling log-dirty "
+ "on already shadowed doms\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sh_alloc_log_dirty_bitmap(d);
+ if ( ret != 0 )
+ {
+ sh_free_log_dirty_bitmap(d);
+ goto out;
+ }
+
+ ret = shadow_one_bit_enable(d, SHM2_log_dirty);
+ if ( ret != 0 )
+ sh_free_log_dirty_bitmap(d);
+
+ out:
+ shadow_unlock(d);
+ domain_unpause(d);
+ return ret;
+}
+
+static int shadow_log_dirty_disable(struct domain *d)
+{
+ int ret;
+
+ domain_pause(d);
+ shadow_lock(d);
+ ret = shadow_one_bit_disable(d, SHM2_log_dirty);
+ if ( !shadow_mode_log_dirty(d) )
+ sh_free_log_dirty_bitmap(d);
+ shadow_unlock(d);
+ domain_unpause(d);
+
+ return ret;
+}
+
+/**************************************************************************/
+/* P2M map manipulations */
+
+static void
+sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+{
+ struct vcpu *v;
+
+ if ( !shadow_mode_translate(d) )
+ return;
+
+ v = current;
+ if ( v->domain != d )
+ v = d->vcpu[0];
+
+
+ SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+ ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
+ //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
+
+ shadow_remove_all_shadows_and_parents(v, _mfn(mfn));
+ if ( shadow_remove_all_mappings(v, _mfn(mfn)) )
+ flush_tlb_mask(d->domain_dirty_cpumask);
+ shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
+ set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+}
+
+void
+shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn)
+{
+ shadow_lock(d);
+ shadow_audit_p2m(d);
+ sh_p2m_remove_page(d, gfn, mfn);
+ shadow_audit_p2m(d);
+ shadow_unlock(d);
+}
+
+void
+shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+ unsigned long mfn)
+{
+ struct vcpu *v;
+ unsigned long ogfn;
+ mfn_t omfn;
+
+ if ( !shadow_mode_translate(d) )
+ return;
+
+ v = current;
+ if ( v->domain != d )
+ v = d->vcpu[0];
+
+ shadow_lock(d);
+ shadow_audit_p2m(d);
+
+ SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+ omfn = sh_gfn_to_mfn(d, gfn);
+ if ( valid_mfn(omfn) )
+ {
+ /* Get rid of the old mapping, especially any shadows */
+ shadow_remove_all_shadows_and_parents(v, omfn);
+ if ( shadow_remove_all_mappings(v, omfn) )
+ flush_tlb_mask(d->domain_dirty_cpumask);
+ set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+ }
+
+ ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
+ if (
+#ifdef __x86_64__
+ (ogfn != 0x5555555555555555L)
+#else
+ (ogfn != 0x55555555L)
+#endif
+ && (ogfn != INVALID_M2P_ENTRY)
+ && (ogfn != gfn) )
+ {
+ /* This machine frame is already mapped at another physical address */
+ SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
+ mfn, ogfn, gfn);
+ if ( valid_mfn(omfn = sh_gfn_to_mfn(d, ogfn)) )
+ {
+ SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n",
+ ogfn , mfn_x(omfn));
+ if ( mfn_x(omfn) == mfn )
+ sh_p2m_remove_page(d, ogfn, mfn);
+ }
+ }
+
+ shadow_set_p2m_entry(d, gfn, _mfn(mfn));
+ set_gpfn_from_mfn(mfn, gfn);
+ shadow_audit_p2m(d);
+ shadow_unlock(d);
+}
+
+/**************************************************************************/
+/* Log-dirty mode support */
+
+/* Convert a shadow to log-dirty mode. */
+void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
+{
+ BUG();
+}
+
+
+/* Read a domain's log-dirty bitmap and stats.
+ * If the operation is a CLEAN, clear the bitmap and stats as well. */
+static int shadow_log_dirty_op(
+ struct domain *d, struct xen_domctl_shadow_op *sc)
+{
+ int i, rv = 0, clean = 0;
+
+ domain_pause(d);
+ shadow_lock(d);
+
+ clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
+
+ SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
+ (clean) ? "clean" : "peek",
+ d->domain_id,
+ d->arch.shadow.fault_count,
+ d->arch.shadow.dirty_count);
+
+ sc->stats.fault_count = d->arch.shadow.fault_count;
+ sc->stats.dirty_count = d->arch.shadow.dirty_count;
+
+ if ( clean )
+ {
+ struct list_head *l, *t;
+ struct page_info *pg;
+
+ /* Need to revoke write access to the domain's pages again.
+ * In future, we'll have a less heavy-handed approach to this,
+ * but for now, we just unshadow everything except Xen. */
+ list_for_each_safe(l, t, &d->arch.shadow.toplevel_shadows)
+ {
+ pg = list_entry(l, struct page_info, list);
+ shadow_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
+ }
+
+ d->arch.shadow.fault_count = 0;
+ d->arch.shadow.dirty_count = 0;
+ }
+
+ if ( guest_handle_is_null(sc->dirty_bitmap) ||
+ (d->arch.shadow.dirty_bitmap == NULL) )
+ {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
+ sc->pages = d->arch.shadow.dirty_bitmap_size;
+
+#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
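+    /* (Worked example, illustrative only: on a 64-bit build with
+     *  sc->pages = 100000 and i = 98304, the final chunk covers 1696 bits,
+     *  so bytes = (1696 + 7) / 8 = 212, and the copy below transfers
+     *  (212 + 7) / 8 = 27 unsigned longs starting at long offset
+     *  98304 / 64 = 1536 into both bitmaps.) */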
+ for ( i = 0; i < sc->pages; i += CHUNK )
+ {
+ int bytes = ((((sc->pages - i) > CHUNK)
+ ? CHUNK
+ : (sc->pages - i)) + 7) / 8;
+
+ if ( copy_to_guest_offset(
+ sc->dirty_bitmap,
+ i/(8*sizeof(unsigned long)),
+ d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
+ (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
+ {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ if ( clean )
+ memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
+ 0, bytes);
+ }
+#undef CHUNK
+
+ out:
+ shadow_unlock(d);
+ domain_unpause(d);
+    return rv;
+}
+
+
+/* Mark a page as dirty */
+void sh_do_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+ unsigned long pfn;
+
+ ASSERT(shadow_lock_is_acquired(d));
+ ASSERT(shadow_mode_log_dirty(d));
+
+ if ( !valid_mfn(gmfn) )
+ return;
+
+ ASSERT(d->arch.shadow.dirty_bitmap != NULL);
+
+ /* We /really/ mean PFN here, even for non-translated guests. */
+ pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+ /*
+ * Values with the MSB set denote MFNs that aren't really part of the
+ * domain's pseudo-physical memory map (e.g., the shared info frame).
+ * Nothing to do here...
+ */
+ if ( unlikely(!VALID_M2P(pfn)) )
+ return;
+
+ /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
+ if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) )
+ {
+ if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
+ {
+ SHADOW_DEBUG(LOGDIRTY,
+ "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
+ mfn_x(gmfn), pfn, d->domain_id);
+ d->arch.shadow.dirty_count++;
+ }
+ }
+ else
+ {
+ SHADOW_PRINTK("mark_dirty OOR! "
+ "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
+ "owner=%d c=%08x t=%" PRtype_info "\n",
+ mfn_x(gmfn),
+ pfn,
+ d->arch.shadow.dirty_bitmap_size,
+ d->domain_id,
+ (page_get_owner(mfn_to_page(gmfn))
+ ? page_get_owner(mfn_to_page(gmfn))->domain_id
+ : -1),
+ mfn_to_page(gmfn)->count_info,
+ mfn_to_page(gmfn)->u.inuse.type_info);
+ }
+}
+
+
+/**************************************************************************/
+/* Shadow-control XEN_DOMCTL dispatcher */
+
+int shadow_domctl(struct domain *d,
+ xen_domctl_shadow_op_t *sc,
+ XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
+{
+ int rc, preempted = 0;
+
+ if ( unlikely(d == current->domain) )
+ {
+ DPRINTK("Don't try to do a shadow op on yourself!\n");
+ return -EINVAL;
+ }
+
+ switch ( sc->op )
+ {
+ case XEN_DOMCTL_SHADOW_OP_OFF:
+ if ( shadow_mode_log_dirty(d) )
+ if ( (rc = shadow_log_dirty_disable(d)) != 0 )
+ return rc;
+ if ( d->arch.shadow.mode & SHM2_enable )
+ if ( (rc = shadow_test_disable(d)) != 0 )
+ return rc;
+ return 0;
+
+ case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
+ return shadow_test_enable(d);
+
+ case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
+ return shadow_log_dirty_enable(d);
+
+ case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
+ return shadow_enable(d, SHM2_refcounts|SHM2_translate);
+
+ case XEN_DOMCTL_SHADOW_OP_CLEAN:
+ case XEN_DOMCTL_SHADOW_OP_PEEK:
+ return shadow_log_dirty_op(d, sc);
+
+ case XEN_DOMCTL_SHADOW_OP_ENABLE:
+ if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
+ return shadow_log_dirty_enable(d);
+ return shadow_enable(d, sc->mode << SHM2_shift);
+
+ case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+ sc->mb = shadow_get_allocation(d);
+ return 0;
+
+ case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
+ rc = shadow_set_allocation(d, sc->mb, &preempted);
+ if ( preempted )
+ /* Not finished. Set up to re-run the call. */
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_domctl, "h", u_domctl);
+ else
+ /* Finished. Return the new allocation */
+ sc->mb = shadow_get_allocation(d);
+ return rc;
+
+ default:
+ SHADOW_ERROR("Bad shadow op %u\n", sc->op);
+ return -EINVAL;
+ }
+}
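+
+/* Typical caller sequence (sketch only, e.g. from a save/restore tool):
+ *   XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION   -- size the shadow pool
+ *   XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY  -- start tracking guest writes
+ *   XEN_DOMCTL_SHADOW_OP_CLEAN (repeated) -- read and reset the dirty bitmap
+ *   XEN_DOMCTL_SHADOW_OP_OFF              -- tear everything down again */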
+
+
+/**************************************************************************/
+/* Auditing shadow tables */
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
+
+void shadow_audit_tables(struct vcpu *v)
+{
+ /* Dispatch table for getting per-type functions */
+ static hash_callback_t callbacks[16] = {
+ NULL, /* none */
+#if CONFIG_PAGING_LEVELS == 2
+ SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */
+ SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */
+ SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */
+#else
+ SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */
+ SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */
+ SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */
+ SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */
+ SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
+ SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */
+ SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */
+ SHADOW_INTERNAL_NAME(sh_audit_l3_table,3,3), /* l3_pae */
+#if CONFIG_PAGING_LEVELS >= 4
+ SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */
+ SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */
+ SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */
+ SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */
+ SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS > 2 */
+ NULL /* All the rest */
+ };
+ unsigned int mask;
+
+ if ( !(SHADOW_AUDIT_ENABLE) )
+ return;
+
+ if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
+ mask = ~1; /* Audit every table in the system */
+ else
+ {
+ /* Audit only the current mode's tables */
+ switch ( v->arch.shadow.mode->guest_levels )
+ {
+ case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
+ case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
+ |SHF_L2H_PAE|SHF_L3_PAE); break;
+ case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
+ |SHF_L3_64|SHF_L4_64); break;
+ default: BUG();
+ }
+ }
+
+    hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
+}
+
+#endif /* Shadow audit */
+
+
+/**************************************************************************/
+/* Auditing p2m tables */
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_P2M
+
+void shadow_audit_p2m(struct domain *d)
+{
+ struct list_head *entry;
+ struct page_info *page;
+ struct domain *od;
+ unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+ mfn_t p2mfn;
+ unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
+ int test_linear;
+
+ if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
+ return;
+
+ //SHADOW_PRINTK("p2m audit starts\n");
+
+ test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
+ if ( test_linear )
+ local_flush_tlb();
+
+ /* Audit part one: walk the domain's page allocation list, checking
+ * the m2p entries. */
+ for ( entry = d->page_list.next;
+ entry != &d->page_list;
+ entry = entry->next )
+ {
+ page = list_entry(entry, struct page_info, list);
+ mfn = mfn_x(page_to_mfn(page));
+
+ // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
+
+ od = page_get_owner(page);
+
+ if ( od != d )
+ {
+ SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+ mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+ continue;
+ }
+
+ gfn = get_gpfn_from_mfn(mfn);
+ if ( gfn == INVALID_M2P_ENTRY )
+ {
+ orphans_i++;
+ //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+ // mfn);
+ continue;
+ }
+
+ if ( gfn == 0x55555555 )
+ {
+ orphans_d++;
+ //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
+ // mfn);
+ continue;
+ }
+
+ p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
+ if ( mfn_x(p2mfn) != mfn )
+ {
+ mpbad++;
+ SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+ " (-> gfn %#lx)\n",
+ mfn, gfn, mfn_x(p2mfn),
+ (mfn_valid(p2mfn)
+ ? get_gpfn_from_mfn(mfn_x(p2mfn))
+ : -1u));
+ /* This m2p entry is stale: the domain has another frame in
+ * this physical slot. No great disaster, but for neatness,
+ * blow away the m2p entry. */
+ set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+ }
+
+ if ( test_linear )
+ {
+ lp2mfn = get_mfn_from_gpfn(gfn);
+ if ( lp2mfn != mfn_x(p2mfn) )
+ {
+ SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+                              "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
+ }
+ }
+
+ // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
+ // mfn, gfn, p2mfn, lp2mfn);
+ }
+
+ /* Audit part two: walk the domain's p2m table, checking the entries. */
+ if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
+ {
+ l2_pgentry_t *l2e;
+ l1_pgentry_t *l1e;
+ int i1, i2;
+
+#if CONFIG_PAGING_LEVELS == 4
+ l4_pgentry_t *l4e;
+ l3_pgentry_t *l3e;
+ int i3, i4;
+ l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#elif CONFIG_PAGING_LEVELS == 3
+ l3_pgentry_t *l3e;
+ int i3;
+ l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#else /* CONFIG_PAGING_LEVELS == 2 */
+ l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#endif
+
+ gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 3
+#if CONFIG_PAGING_LEVELS >= 4
+ for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+ {
+ if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
+#endif /* now at levels 3 or 4... */
+ for ( i3 = 0;
+ i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+ i3++ )
+ {
+ if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
+#endif /* all levels... */
+ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+ {
+ if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+ {
+ gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
+
+ for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+ {
+ if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+ continue;
+ mfn = l1e_get_pfn(l1e[i1]);
+ ASSERT(valid_mfn(_mfn(mfn)));
+ m2pfn = get_gpfn_from_mfn(mfn);
+ if ( m2pfn != gfn )
+ {
+ pmbad++;
+ SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+ " -> gfn %#lx\n", gfn, mfn, m2pfn);
+ BUG();
+ }
+ }
+ sh_unmap_domain_page(l1e);
+ }
+#if CONFIG_PAGING_LEVELS >= 3
+ sh_unmap_domain_page(l2e);
+ }
+#if CONFIG_PAGING_LEVELS >= 4
+ sh_unmap_domain_page(l3e);
+ }
+#endif
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+ sh_unmap_domain_page(l4e);
+#elif CONFIG_PAGING_LEVELS == 3
+ sh_unmap_domain_page(l3e);
+#else /* CONFIG_PAGING_LEVELS == 2 */
+ sh_unmap_domain_page(l2e);
+#endif
+
+ }
+
+ //SHADOW_PRINTK("p2m audit complete\n");
+ //if ( orphans_i | orphans_d | mpbad | pmbad )
+ // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+ // orphans_i + orphans_d, orphans_i, orphans_d,
+ if ( mpbad | pmbad )
+ SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+ pmbad, mpbad);
+}
+
+#endif /* p2m audit */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/mm/shadow/multi.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c Mon Aug 28 16:26:37 2006 -0600
@@ -0,0 +1,4492 @@
+/******************************************************************************
+ * arch/x86/mm/shadow/multi.c
+ *
+ * Simple, mostly-synchronous shadow page tables.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+// DESIGN QUESTIONS:
+// Why use subshadows for PAE guests?
+// - reduces pressure in the hash table
+// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
+// - would need to find space in the page_info to store 7 more bits of
+// backpointer
+// - independent shadows of 32 byte chunks makes it non-obvious how to quickly
+// figure out when to demote the guest page from l3 status
+//
+// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
+// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
+// space for both PV and HVM guests.
+//
+
+#define SHADOW 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/domain_page.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/shadow.h>
+#include <asm/flushtlb.h>
+#include <asm/hvm/hvm.h>
+#include "private.h"
+#include "types.h"
+
+/* The first cut: an absolutely synchronous, trap-and-emulate version,
+ * supporting only HVM guests (and so only "external" shadow mode).
+ *
+ * THINGS TO DO LATER:
+ *
+ * FIX GVA_TO_GPA
+ * The current interface returns an unsigned long, which is not big enough
+ * to hold a physical address in PAE. Should return a gfn instead.
+ *
+ * TEARDOWN HEURISTICS
+ * Also: have a heuristic for when to destroy a previous paging-mode's
+ * shadows. When a guest is done with its start-of-day 32-bit tables
+ * and reuses the memory we want to drop those shadows. Start with
+ * shadows in a page in two modes as a hint, but beware of clever tricks
+ * like reusing a pagetable for both PAE and 64-bit during boot...
+ *
+ * PAE LINEAR MAPS
+ * Rework shadow_get_l*e() to have the option of using map_domain_page()
+ * instead of linear maps. Add appropriate unmap_l*e calls in the users.
+ * Then we can test the speed difference made by linear maps. If the
+ * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
+ * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
+ * to share l2h pages again.
+ *
+ * PAE L3 COPYING
+ * In this code, we copy all 32 bytes of a PAE L3 every time we change an
+ * entry in it, and every time we change CR3. We copy it for the linear
+ * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
+ * buffer so it fits in CR3. Maybe we can avoid some of this recopying
+ * by using the shadow directly in some places.
+ * Also, for SMP, need to actually respond to seeing shadow.pae_flip_pending.
+ *
+ * GUEST_WALK_TABLES TLB FLUSH COALESCE
+ * guest_walk_tables can do up to three remote TLB flushes as it walks to
+ * the first l1 of a new pagetable. Should coalesce the flushes to the end,
+ * and if we do flush, re-do the walk. If anything has changed, then
+ * pause all the other vcpus and do the walk *again*.
+ *
+ * WP DISABLED
+ * Consider how to implement having the WP bit of CR0 set to 0.
+ * Since we need to be able to cause write faults to pagetables, this might
+ * end up looking like not having the (guest) pagetables present at all in
+ * HVM guests...
+ *
+ * PSE disabled / PSE36
+ * We don't support any modes other than PSE enabled, PSE36 disabled.
+ * Neither of those would be hard to change, but we'd need to be able to
+ * deal with shadows made in one mode and used in another.
+ */
+
+#define FETCH_TYPE_PREFETCH 1
+#define FETCH_TYPE_DEMAND 2
+#define FETCH_TYPE_WRITE 4
+typedef enum {
+ ft_prefetch = FETCH_TYPE_PREFETCH,
+ ft_demand_read = FETCH_TYPE_DEMAND,
+ ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
+} fetch_type_t;
+
+#ifdef DEBUG_TRACE_DUMP
+static char *fetch_type_names[] = {
+    [ft_prefetch]     = "prefetch",
+    [ft_demand_read]  = "demand read",
+    [ft_demand_write] = "demand write",
+};
+#endif
+
+/* XXX forward declarations */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab,
+                                       int clear_res);
+#endif
+static inline void sh_update_linear_entries(struct vcpu *v);
+
+/**************************************************************************/
+/* Hash table mapping from guest pagetables to shadows
+ *
+ * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
+ * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
+ * shadow L1 which maps its "splinters".
+ * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
+ * PAE L3 info page for that CR3 value.
+ */
+
+static inline mfn_t
+get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
+/* Look for FL1 shadows in the hash table */
+{
+ mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn),
+ PGC_SH_fl1_shadow >> PGC_SH_type_shift);
+
+ if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+ {
+ struct page_info *page = mfn_to_page(smfn);
+ if ( !(page->count_info & PGC_SH_log_dirty) )
+ shadow_convert_to_log_dirty(v, smfn);
+ }
+
+ return smfn;
+}
+
+static inline mfn_t
+get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+/* Look for shadows in the hash table */
+{
+ mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn),
+ shadow_type >> PGC_SH_type_shift);
+ perfc_incrc(shadow_get_shadow_status);
+
+ if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+ {
+ struct page_info *page = mfn_to_page(smfn);
+ if ( !(page->count_info & PGC_SH_log_dirty) )
+ shadow_convert_to_log_dirty(v, smfn);
+ }
+
+ return smfn;
+}
+
+static inline void
+set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+/* Put an FL1 shadow into the hash table */
+{
+ SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
+ gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn));
+
+ if ( unlikely(shadow_mode_log_dirty(v->domain)) )
+ // mark this shadow as a log dirty shadow...
+ set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
+ else
+ clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
+
+ shadow_hash_insert(v, gfn_x(gfn),
+ PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn);
+}
+
+static inline void
+set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+/* Put a shadow into the hash table */
+{
+ struct domain *d = v->domain;
+ int res;
+
+ SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
+ d->domain_id, v->vcpu_id, mfn_x(gmfn),
+ shadow_type, mfn_x(smfn));
+
+ if ( unlikely(shadow_mode_log_dirty(d)) )
+ // mark this shadow as a log dirty shadow...
+ set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
+ else
+ clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
+
+ res = get_page(mfn_to_page(gmfn), d);
+ ASSERT(res == 1);
+
+ shadow_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH_type_shift,
+ smfn);
+}
+
+static inline void
+delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+/* Remove a shadow from the hash table */
+{
+ SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
+ gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn));
+
+ shadow_hash_delete(v, gfn_x(gfn),
+ PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn);
+}
+
+static inline void
+delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+/* Remove a shadow from the hash table */
+{
+ SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
+ v->domain->domain_id, v->vcpu_id,
+ mfn_x(gmfn), shadow_type, mfn_x(smfn));
+ shadow_hash_delete(v, mfn_x(gmfn),
+ shadow_type >> PGC_SH_type_shift, smfn);
+ put_page(mfn_to_page(gmfn));
+}
+
+/**************************************************************************/
+/* CPU feature support querying */
+
+static inline int
+guest_supports_superpages(struct vcpu *v)
+{
+ /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
+ * CR4.PSE is set or the guest is in PAE or long mode */
+ return (hvm_guest(v) && (GUEST_PAGING_LEVELS != 2
+ || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
+}
+
+static inline int
+guest_supports_nx(struct vcpu *v)
+{
+ if ( !hvm_guest(v) )
+ return cpu_has_nx;
+
+ // XXX - fix this!
+ return 1;
+}
+
+
+/**************************************************************************/
+/* Functions for walking the guest page tables */
+
+
+/* Walk the guest pagetables, filling the walk_t with what we see.
+ * Takes an uninitialised walk_t. The caller must call unmap_walk()
+ * on the walk_t before discarding it or calling guest_walk_tables again.
+ * If "guest_op" is non-zero, we are serving a genuine guest memory access,
+ * and must (a) be under the shadow lock, and (b) remove write access
+ * from any guest PT pages we see, as we will be using their contents to
+ * perform shadow updates.
+ * Returns 0 for success or non-zero if the guest pagetables are malformed.
+ * N.B. Finding a not-present entry does not cause a non-zero return code. */
+static inline int
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
+{
+ ASSERT(!guest_op || shadow_lock_is_acquired(v->domain));
+
+ perfc_incrc(shadow_guest_walk);
+ memset(gw, 0, sizeof(*gw));
+ gw->va = va;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ /* Get l4e from the top level table */
+ gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
+ gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
+ /* Walk down to the l3e */
+ if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
+ gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
+ if ( !valid_mfn(gw->l3mfn) ) return 1;
+ /* This mfn is a pagetable: make sure the guest can't write to it. */
+ if ( guest_op && shadow_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
+ + guest_l3_table_offset(va);
+#else /* PAE only... */
+ /* Get l3e from the top level table */
+ gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
+ gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
+#endif /* PAE or 64... */
+ /* Walk down to the l2e */
+ if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
+ gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
+ if ( !valid_mfn(gw->l2mfn) ) return 1;
+ /* This mfn is a pagetable: make sure the guest can't write to it. */
+ if ( guest_op && shadow_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
+ + guest_l2_table_offset(va);
+#else /* 32-bit only... */
+ /* Get l2e from the top level table */
+ gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
+ gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
+#endif /* All levels... */
+
+ if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
+ if ( guest_supports_superpages(v) &&
+ (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) )
+ {
+ /* Special case: this guest VA is in a PSE superpage, so there's
+ * no guest l1e. We make one up so that the propagation code
+ * can generate a shadow l1 table. Start with the gfn of the
+ * first 4k-page of the superpage. */
+ gfn_t start = guest_l2e_get_gfn(*gw->l2e);
+ /* Grant full access in the l1e, since all the guest entry's
+ * access controls are enforced in the shadow l2e. This lets
+ * us reflect l2 changes later without touching the l1s. */
+ int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+ _PAGE_ACCESSED|_PAGE_DIRTY);
+ /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
+ * of the level 1 */
+ if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
+ flags |= _PAGE_PAT;
+ /* Increment the pfn by the right number of 4k pages.
+ * The ~0x1 is to mask out the PAT bit mentioned above. */
+ start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
+ gw->eff_l1e = guest_l1e_from_gfn(start, flags);
+ gw->l1e = NULL;
+ gw->l1mfn = _mfn(INVALID_MFN);
+ }
+ else
+ {
+ /* Not a superpage: carry on and find the l1e. */
+ gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
+ if ( !valid_mfn(gw->l1mfn) ) return 1;
+ /* This mfn is a pagetable: make sure the guest can't write to it. */
+ if ( guest_op
+ && shadow_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
+ flush_tlb_mask(v->domain->domain_dirty_cpumask);
+ gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
+ + guest_l1_table_offset(va);
+ gw->eff_l1e = *gw->l1e;
+ }
+
+ return 0;
+}
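+
+/* Usage sketch (added for clarity; not part of the interface proper):
+ *
+ *     walk_t gw;
+ *     if ( guest_walk_tables(v, va, &gw, 1) == 0 )  // under the shadow lock
+ *         gfn = guest_walk_to_gfn(&gw);     // INVALID_GFN if not present
+ *     unmap_walk(v, &gw);                   // always dispose of the walk
+ */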
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number. */
+static inline gfn_t
+guest_walk_to_gfn(walk_t *gw)
+{
+ if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
+ return _gfn(INVALID_GFN);
+ return guest_l1e_get_gfn(gw->eff_l1e);
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address. */
+static inline paddr_t
+guest_walk_to_gpa(walk_t *gw)
+{
+ if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
+ return 0;
+ return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
+}
+
+
+/* Unmap (and reinitialise) a guest walk.
+ * Call this to dispose of any walk filled in by guest_walk_tables() */
+static void unmap_walk(struct vcpu *v, walk_t *gw)
+{
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+ if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
+#endif
+ if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
+#endif
+ if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
+#ifdef DEBUG
+ memset(gw, 0, sizeof(*gw));
+#endif
+}
+
+
+/* Pretty-print the contents of a guest-walk */
+static inline void print_gw(walk_t *gw)
+{
+ SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ SHADOW_PRINTK(" l4mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l4mfn));
+ SHADOW_PRINTK(" l4e=%p\n", gw->l4e);
+ if ( gw->l4e )
+ SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
+#endif /* PAE or 64... */
+ SHADOW_PRINTK(" l3mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l3mfn));
+ SHADOW_PRINTK(" l3e=%p\n", gw->l3e);
+ if ( gw->l3e )
+ SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
+#endif /* All levels... */
+ SHADOW_PRINTK(" l2mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l2mfn));
+ SHADOW_PRINTK(" l2e=%p\n", gw->l2e);
+ if ( gw->l2e )
+ SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
+ SHADOW_PRINTK(" l1mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l1mfn));
+ SHADOW_PRINTK(" l1e=%p\n", gw->l1e);
+ if ( gw->l1e )
+ SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
+ SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
+}
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+/* Lightweight audit: pass all the shadows associated with this guest walk
+ * through the audit mechanisms */
+static void sh_audit_gw(struct vcpu *v, walk_t *gw)
+{
+ mfn_t smfn;
+
+ if ( !(SHADOW_AUDIT_ENABLE) )
+ return;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ if ( valid_mfn(gw->l4mfn)
+ && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn,
+ PGC_SH_l4_shadow))) )
+ (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
+#endif /* PAE or 64... */
+ if ( valid_mfn(gw->l3mfn)
+ && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn,
+ PGC_SH_l3_shadow))) )
+ (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
+#endif /* All levels... */
+ if ( valid_mfn(gw->l2mfn) )
+ {
+ if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
+ PGC_SH_l2_shadow))) )
+ (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
+#if GUEST_PAGING_LEVELS == 3
+ if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn,
+ PGC_SH_l2h_shadow))) )
+ (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
+#endif
+ }
+ if ( valid_mfn(gw->l1mfn)
+ && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn,
+ PGC_SH_l1_shadow))) )
+ (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
+ else if ( gw->l2e
+ && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
+ && valid_mfn(
+ (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
+ (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
+}
+
+#else
+#define sh_audit_gw(_v, _gw) do {} while(0)
+#endif /* audit code */
+
+
+
+/**************************************************************************/
+/* Function to write to the guest tables, for propagating accessed and
+ * dirty bits from the shadow to the guest.
+ * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
+ * and an operation type. The guest entry is always passed as an l1e:
+ * since we only ever write flags, that's OK.
+ * Returns the new flag bits of the guest entry. */
+
+static u32 guest_set_ad_bits(struct vcpu *v,
+ mfn_t gmfn,
+ guest_l1e_t *ep,
+ unsigned int level,
+ fetch_type_t ft)
+{
+ u32 flags, shflags, bit;
+ struct page_info *pg;
+ int res = 0;
+
+ ASSERT(valid_mfn(gmfn)
+ && (sh_mfn_is_a_page_table(gmfn)
+ || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask)
+ == 0)));
+ ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
+ ASSERT(level <= GUEST_PAGING_LEVELS);
+ ASSERT(ft == ft_demand_read || ft == ft_demand_write);
+ ASSERT(shadow_lock_is_acquired(v->domain));
+
+ flags = guest_l1e_get_flags(*ep);
+
+ /* PAE l3s do not have A and D bits */
+ if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) )
+ return flags;
+
+ /* Need the D bit as well for writes, in l1es and 32bit/PAE PSE l2es. */
+ if ( ft == ft_demand_write
+ && (level == 1 ||
+ (level == 2 && GUEST_PAGING_LEVELS < 4
+ && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
+ {
+ if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED))
+ == (_PAGE_DIRTY | _PAGE_ACCESSED) )
+ return flags; /* Guest already has A and D bits set */
+ flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
+ perfc_incrc(shadow_ad_update);
+ }
+ else
+ {
+ if ( flags & _PAGE_ACCESSED )
+ return flags; /* Guest already has A bit set */
+ flags |= _PAGE_ACCESSED;
+ perfc_incrc(shadow_a_update);
+ }
+
+ /* Set the bit(s) */
+ sh_mark_dirty(v->domain, gmfn);
+ SHADOW_DEBUG(A_AND_D, "gfn = %"SH_PRI_gfn", "
+ "old flags = %#x, new flags = %#x\n",
+ guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags);
+ *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
+
+ /* May need to propagate this change forward to other kinds of shadow */
+ pg = mfn_to_page(gmfn);
+ if ( !sh_mfn_is_a_page_table(gmfn) )
+ {
+ /* This guest pagetable is not yet shadowed at all. */
+ // MAF: I think this assert is busted... If this gmfn has not yet
+ // been promoted, then it seems perfectly reasonable for there to be
+ // outstanding type refs to it...
+ /* TJD: No. If the gmfn has not been promoted, we must at least
+ * have recognised that it is a pagetable, and pulled write access.
+ * The type count should only be non-zero if it is actually a page
+ * table. The test above was incorrect, though, so I've fixed it. */
+ ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0);
+ return flags;
+ }
+
+ shflags = pg->shadow_flags & SHF_page_type_mask;
+ while ( shflags )
+ {
+ bit = find_first_set_bit(shflags);
+ ASSERT(shflags & (1u << bit));
+ shflags &= ~(1u << bit);
+ if ( !(pg->shadow_flags & (1u << bit)) )
+ continue;
+ switch ( bit )
+ {
+ case PGC_SH_type_to_index(PGC_SH_l1_shadow):
+ if (level != 1)
+ res |= sh_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep));
+ break;
+ case PGC_SH_type_to_index(PGC_SH_l2_shadow):
+ if (level != 2)
+ res |= sh_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep));
+ break;
+#if GUEST_PAGING_LEVELS == 3 /* PAE only */
+ case PGC_SH_type_to_index(PGC_SH_l2h_shadow):
+ if (level != 2)
+ res |= sh_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep));
+ break;
+#endif
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+ case PGC_SH_type_to_index(PGC_SH_l3_shadow):
+ if (level != 3)
+ res |= sh_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep));
+ break;
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ case PGC_SH_type_to_index(PGC_SH_l4_shadow):
+ if (level != 4)
+ res |= sh_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep));
+ break;
+#endif
+#endif
+ default:
+ SHADOW_ERROR("mfn %"SH_PRI_mfn" is shadowed in multiple "
+ "modes: A&D bits may be out of sync (flags=%#x).\n",
+ mfn_x(gmfn), pg->shadow_flags);
+ /* XXX Shadows in other modes will not be updated, so will
+ * have their A and D bits out of sync. */
+ }
+ }
+
+ /* We should never need to flush the TLB or recopy PAE entries */
+ ASSERT( res == 0 || res == SHADOW_SET_CHANGED );
+ return flags;
+}
+
+/**************************************************************************/
+/* Functions to compute the correct index into a shadow page, given an
+ * index into the guest page (as returned by guest_get_index()).
+ * This is trivial when the shadow and guest use the same sized PTEs, but
+ * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
+ * PAE- or 64-bit shadows).
+ *
+ * These functions also increment the shadow mfn, when necessary. When PTE
+ * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
+ * page. In this case, we allocate 2 contiguous pages for the shadow L1, and
+ * use simple pointer arithmetic on a pointer to the guest L1e to figure out
+ * which shadow page we really want. Similarly, when PTE sizes are
+ * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
+ * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
+ * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
+ * space.)
+ *
+ * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
+ * of shadow (to store both the shadow, and the info that would normally be
+ * stored in page_info fields). This arrangement allows the shadow and the
+ * "page_info" fields to always be stored in the same page (in fact, in
+ * the same cache line), avoiding an extra call to map_domain_page().
+ */
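+
+/* (Worked example, illustrative only: for a 32-bit guest shadowed with
+ *  8-byte PTEs, a guest L1 holds 1024 entries but a shadow L1 holds only
+ *  512, so guest index 700 selects the second of the two contiguous shadow
+ *  pages (smfn + 1) at index 700 % 512 = 188.  For an L2, each guest entry
+ *  becomes two shadow entries, so guest index 300 selects shadow page
+ *  smfn + 300/256 = smfn + 1 at index (300 % 256) * 2 = 88.) */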
+
+static inline u32
+guest_index(void *ptr)
+{
+ return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
+}
+
+static inline u32
+shadow_l1_index(mfn_t *smfn, u32 guest_index)
+{
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+ *smfn = _mfn(mfn_x(*smfn) +
+ (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
+ return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
+#else
+ return guest_index;
+#endif
+}
+
+static inline u32
+shadow_l2_index(mfn_t *smfn, u32 guest_index)
+{
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+ // Because we use 2 shadow l2 entries for each guest entry, the number of
+ // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
+ //
+ *smfn = _mfn(mfn_x(*smfn) +
+ (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
+
+    // We multiply by two to get the index of the first of the two entries
+ // used to shadow the specified guest entry.
+ return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
+#else
+ return guest_index;
+#endif
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+
+static inline u32
+shadow_l3_index(mfn_t *smfn, u32 guest_index)
+{
+#if GUEST_PAGING_LEVELS == 3
+ u32 group_id;
+
+ // Because we use twice the space in L3 shadows as was consumed in guest
+ // L3s, the number of guest entries per shadow page is
+ // SHADOW_L2_PAGETABLE_ENTRIES/2. (Note this is *not*
+ // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...)
+ //
+ *smfn = _mfn(mfn_x(*smfn) +
+ (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
+
+ // We store PAE L3 shadows in groups of 4, alternating shadows and
+ // pae_l3_bookkeeping structs. So the effective shadow index is
+    // the group_id * 8 + the offset within the group.
+ //
+ guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2);
+ group_id = guest_index / 4;
+ return (group_id * 8) + (guest_index % 4);
+#else
+ return guest_index;
+#endif
+}
+
+#endif // GUEST_PAGING_LEVELS >= 3
+
+#if GUEST_PAGING_LEVELS >= 4
+
+static inline u32
+shadow_l4_index(mfn_t *smfn, u32 guest_index)
+{
+ return guest_index;
+}
+
+#endif // GUEST_PAGING_LEVELS >= 4
+
+
+/**************************************************************************/
+/* Functions which compute shadow entries from their corresponding guest
+ * entries.
+ *
+ * These are the "heart" of the shadow code.
+ *
+ * There are two sets of these: those that are called on demand faults (read
+ * faults and write faults), and those that are essentially called to
+ * "prefetch" (or propagate) entries from the guest into the shadow. The read
+ * fault and write fault are handled as two separate cases for L1 entries (due
+ * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
+ * into the respective demand_fault functions.
+ */
+
+#define CHECK(_cond) \
+do { \
+ if (unlikely(!(_cond))) \
+ { \
+ printk("%s %s %d ASSERTION (%s) FAILED\n", \
+ __func__, __FILE__, __LINE__, #_cond); \
+ return -1; \
+ } \
+} while (0);
+
+// The function below tries to capture all of the flag manipulation for the
+// demand and propagate functions into one place.
+//
+static always_inline u32
+sh_propagate_flags(struct vcpu *v, mfn_t target_mfn,
+ u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn,
+ int mmio, int level, fetch_type_t ft)
+{
+ struct domain *d = v->domain;
+ u32 pass_thru_flags;
+ u32 sflags;
+
+ // XXX -- might want to think about PAT support for HVM guests...
+
+#ifndef NDEBUG
+ // MMIO can only occur from L1e's
+ //
+ if ( mmio )
+ CHECK(level == 1);
+
+ // We should always have a pointer to the guest entry if it's a non-PSE
+ // non-MMIO demand access.
+ if ( ft & FETCH_TYPE_DEMAND )
+ CHECK(guest_entry_ptr || level == 1);
+#endif
+
+ // A not-present guest entry has a special signature in the shadow table,
+ // so that we do not have to consult the guest tables multiple times...
+ //
+ if ( unlikely(!(gflags & _PAGE_PRESENT)) )
+ return _PAGE_SHADOW_GUEST_NOT_PRESENT;
+
+ // Must have a valid target_mfn, unless this is mmio, or unless this is a
+ // prefetch. In the case of a prefetch, an invalid mfn means that we can
+ // not usefully shadow anything, and so we return early.
+ //
+ if ( !valid_mfn(target_mfn) )
+ {
+ CHECK((ft == ft_prefetch) || mmio);
+ if ( !mmio )
+ return 0;
+ }
+
+ // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's...
+ //
+ if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) )
+ pass_thru_flags = _PAGE_PRESENT;
+ else
+ {
+ pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
+ _PAGE_RW | _PAGE_PRESENT);
+ if ( guest_supports_nx(v) )
+ pass_thru_flags |= _PAGE_NX_BIT;
+ }
+
+ // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their
+ // L3e's; they are all implied. So we emulate them here.
+ //
+ if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) )
+ gflags = pass_thru_flags;
+
+ // Propagate bits from the guest to the shadow.
+ // Some of these may be overwritten, below.
+ // Since we know the guest's PRESENT bit is set, we also set the shadow's
+ // SHADOW_PRESENT bit.
+ //
+ sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT;
+
+ // Copy the guest's RW bit into the SHADOW_RW bit.
+ //
+ if ( gflags & _PAGE_RW )
+ sflags |= _PAGE_SHADOW_RW;
+
+ // Set the A&D bits for higher level shadows.
+ // Higher level entries do not, strictly speaking, have dirty bits, but
+ // since we use shadow linear tables, each of these entries may, at some
+ // point in time, also serve as a shadow L1 entry.
+ // By setting both the A&D bits in each of these, we eliminate the burden
+ // on the hardware to update these bits on initial accesses.
+ //
+ if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
+ sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
+
+
+ // Set the A and D bits in the guest entry, if we need to.
+ if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
+ gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
+
+ // If the A or D bit has not yet been set in the guest, then we must
+ // prevent the corresponding kind of access.
+ //
+ if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) &&
+ !(gflags & _PAGE_ACCESSED)) )
+ sflags &= ~_PAGE_PRESENT;
+
+ /* D bits exist in l1es, and 32bit/PAE PSE l2es, but not 64bit PSE l2es */
+ if ( unlikely( ((level == 1)
+ || ((level == 2) && (GUEST_PAGING_LEVELS < 4)
+ && guest_supports_superpages(v) &&
+ (gflags & _PAGE_PSE)))
+ && !(gflags & _PAGE_DIRTY)) )
+ sflags &= ~_PAGE_RW;
+
+ // MMIO caching
+ //
+ // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit
+ // to cache the fact that this entry is in MMIO space.
+ //
+ if ( (level == 1) && mmio )
+ {
+ sflags &= ~(_PAGE_PRESENT);
+ sflags |= _PAGE_SHADOW_MMIO;
+ }
+ else
+ {
+ // shadow_mode_log_dirty support
+ //
+ // Only allow the guest write access to a page a) on a demand fault,
+ // or b) if the page is already marked as dirty.
+ //
+ if ( unlikely((level == 1) &&
+ !(ft & FETCH_TYPE_WRITE) &&
+ shadow_mode_log_dirty(d) &&
+ !sh_mfn_is_dirty(d, target_mfn)) )
+ {
+ sflags &= ~_PAGE_RW;
+ }
+
+ // protect guest page tables
+ //
+ if ( unlikely((level == 1) &&
+ sh_mfn_is_a_page_table(target_mfn)) )
+ {
+ if ( shadow_mode_trap_reads(d) )
+ {
+ // if we are trapping both reads & writes, then mark this page
+ // as not present...
+ //
+ sflags &= ~_PAGE_PRESENT;
+ }
+ else
+ {
+ // otherwise, just prevent any writes...
+ //
+ sflags &= ~_PAGE_RW;
+ }
+ }
+ }
+
+ return sflags;
+}
+
+#undef CHECK
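+
+/* (Flag-propagation example, illustrative only: a present guest l1e with
+ *  _PAGE_RW|_PAGE_USER|_PAGE_ACCESSED set but _PAGE_DIRTY clear, fetched
+ *  via ft_demand_read, yields a shadow l1e with _PAGE_RW cleared, so the
+ *  guest's first write faults and gives us the chance to set its dirty
+ *  bit before restoring write access.) */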
+
+#if GUEST_PAGING_LEVELS >= 4
+static void
+l4e_propagate_from_guest(struct vcpu *v,
+ guest_l4e_t *gl4e,
+ mfn_t gl4mfn,
+ mfn_t sl3mfn,
+ shadow_l4e_t *sl4p,
+ fetch_type_t ft)
+{
+ u32 gflags = guest_l4e_get_flags(*gl4e);
+ u32 sflags = sh_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e,
+ gl4mfn, 0, 4, ft);
+
+ *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags);
+
+ SHADOW_DEBUG(PROPAGATE,
+ "%s gl4e=%" SH_PRI_gpte " sl4e=%" SH_PRI_pte "\n",
+ fetch_type_names[ft], gl4e->l4, sl4p->l4);
+ ASSERT(sflags != -1);
+}
+#endif // GUEST_PAGING_LEVELS >= 4
+
+#if GUEST_PAGING_LEVELS >= 3
+static void
+l3e_propagate_from_guest(struct vcpu *v,
+ guest_l3e_t *gl3e,
+ mfn_t gl3mfn,
+ mfn_t sl2mfn,
+ shadow_l3e_t *sl3p,
+ fetch_type_t ft)
+{
+ u32 gflags = guest_l3e_get_flags(*gl3e);
+ u32 sflags = sh_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e,
+ gl3mfn, 0, 3, ft);
+
+ *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags);
+
+ SHADOW_DEBUG(PROPAGATE,
+ "%s gl3e=%" SH_PRI_gpte " sl3e=%" SH_PRI_pte "\n",
+ fetch_type_names[ft], gl3e->l3, sl3p->l3);
+ ASSERT(sflags != -1);
+}
+#endif // GUEST_PAGING_LEVELS >= 3
+
+static void
+l2e_propagate_from_guest(struct vcpu *v,
+ guest_l2e_t *gl2e,
+ mfn_t gl2mfn,
+ mfn_t sl1mfn,
+ shadow_l2e_t *sl2p,
+ fetch_type_t ft)
+{
+ u32 gflags = guest_l2e_get_flags(*gl2e);
+ u32 sflags = sh_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e,
+ gl2mfn, 0, 2, ft);
+
+ *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags);
+
+ SHADOW_DEBUG(PROPAGATE,
+ "%s gl2e=%" SH_PRI_gpte " sl2e=%" SH_PRI_pte "\n",
+ fetch_type_names[ft], gl2e->l2, sl2p->l2);
+ ASSERT(sflags != -1);
+}
+
+static inline int
+l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
+ int mmio)
+/* returns 1 if emulation is required, and 0 otherwise */
+{
+ struct domain *d = v->domain;
+ u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
+ u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
+ mmio, 1, ft_demand_read);
+
+ if ( shadow_mode_trap_reads(d) && !mmio && sh_mfn_is_a_page_table(gmfn) )
+ {
+ // emulation required!
+ *sl1p = shadow_l1e_empty();
+ return 1;
+ }
+
+ *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+ SHADOW_DEBUG(PROPAGATE,
+ "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
+ (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
+
+ ASSERT(sflags != -1);
+ return 0;
+}
+
+static inline int
+l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
+ int mmio)
+/* returns 1 if emulation is required, and 0 otherwise */
+{
+ struct domain *d = v->domain;
+ u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
+ u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
+ mmio, 1, ft_demand_write);
+
+ sh_mark_dirty(d, gmfn);
+
+ if ( !mmio && sh_mfn_is_a_page_table(gmfn) )
+ {
+ // emulation required!
+ *sl1p = shadow_l1e_empty();
+ return 1;
+ }
+
+ *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+ SHADOW_DEBUG(PROPAGATE,
+ "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
+ (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
+
+ ASSERT(sflags != -1);
+ return 0;
+}
+
+static inline void
+l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p,
+ int mmio)
+{
+ gfn_t gfn = guest_l1e_get_gfn(gl1e);
+ mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn);
+ u32 gflags = guest_l1e_get_flags(gl1e);
+ u32 sflags = sh_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN),
+ mmio, 1, ft_prefetch);
+
+ *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+ SHADOW_DEBUG(PROPAGATE,
+ "gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
+ gl1e.l1, sl1p->l1);
+
+ ASSERT(sflags != -1);
+}
+
+
+/**************************************************************************/
+/* These functions update shadow entries (and do bookkeeping on the shadow
+ * tables they are in). It is intended that they are the only
+ * functions which ever write (non-zero) data onto a shadow page.
+ *
+ * They return a set of flags:
+ * SHADOW_SET_CHANGED -- we actually wrote a new value to the shadow.
+ * SHADOW_SET_FLUSH -- the caller must cause a TLB flush.
+ * SHADOW_SET_ERROR -- the input is not a valid entry (for example, if
+ * shadow_get_page_from_l1e() fails).
+ * SHADOW_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local
+ * copies of their PAE L3 entries re-copied.
+ */
+
+static inline void safe_write_entry(void *dst, void *src)
+/* Copy one PTE safely when processors might be running on the
+ * destination pagetable. This does *not* give safety against
+ * concurrent writes (that's what the shadow lock is for), just
+ * stops the hardware picking up partially written entries. */
+{
+ volatile unsigned long *d = dst;
+ unsigned long *s = src;
+ ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
+#if CONFIG_PAGING_LEVELS == 3
+ /* In PAE mode, pagetable entries are larger
+ * than machine words, so won't get written atomically. We need to make
+ * sure any other cpu running on these shadows doesn't see a
+ * half-written entry. Do this by marking the entry not-present first,
+ * then writing the high word before the low word. */
+ BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
+ d[0] = 0;
+ d[1] = s[1];
+ d[0] = s[0];
+#else
+ /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
+ * which will be an atomic write, since the entry is aligned. */
+ BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
+ *d = *s;
+#endif
+}
+
+
+static inline void
+shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
+/* This function does the actual writes to shadow pages.
+ * It must not be called directly, since it doesn't do the bookkeeping
+ * that shadow_set_l*e() functions do. */
+{
+ shadow_l1e_t *dst = d;
+ shadow_l1e_t *src = s;
+ void *map = NULL;
+ int i;
+
+ /* Because we mirror access rights at all levels in the shadow, an
+ * l2 (or higher) entry with the RW bit cleared will leave us with
+ * no write access through the linear map.
+ * We detect that by writing to the shadow with copy_to_user() and
+ * using map_domain_page() to get a writeable mapping if we need to. */
+ if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
+ {
+ perfc_incrc(shadow_linear_map_failed);
+ map = sh_map_domain_page(mfn);
+ ASSERT(map != NULL);
+ dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
+ }
+
+
+ for ( i = 0; i < entries; i++ )
+ safe_write_entry(dst++, src++);
+
+ if ( map != NULL ) sh_unmap_domain_page(map);
+
+ /* XXX TODO:
+ * Update min/max field in page_info struct of this mfn */
+}
+
+static inline int
+perms_strictly_increased(u32 old_flags, u32 new_flags)
+/* Given the flags of two entries, are the new flags a strict
+ * increase in rights over the old ones? */
+{
+ u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
+ u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
+ /* Flip the NX bit, since it's the only one that decreases rights;
+ * we calculate as if it were an "X" bit. */
+ of ^= _PAGE_NX_BIT;
+ nf ^= _PAGE_NX_BIT;
+ /* If the changed bits are all set in the new flags, then rights strictly
+ * increased between old and new. */
+ return ((of | (of ^ nf)) == nf);
+}
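+
+/* Illustrative sketch of what the check above accepts and rejects; kept
+ * under #if 0, not built, and the function name is made up for the
+ * example only. */
+#if 0
+static void example_perms_strictly_increased(void)
+{
+ /* Gaining RW on a present user mapping only adds rights, so no
+ * TLB flush is needed on that transition. */
+ ASSERT(perms_strictly_increased(_PAGE_PRESENT|_PAGE_USER,
+ _PAGE_PRESENT|_PAGE_USER|_PAGE_RW));
+ /* Trading RW for USER loses a right, so the caller must flush. */
+ ASSERT(!perms_strictly_increased(_PAGE_PRESENT|_PAGE_RW,
+ _PAGE_PRESENT|_PAGE_USER));
+}
+#endif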
+
+static int inline
+shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
+{
+ int res;
+ mfn_t mfn;
+ struct domain *owner;
+ shadow_l1e_t sanitized_sl1e =
+ shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT);
+
+ //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT);
+ //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0);
+
+ if ( !shadow_mode_refcounts(d) )
+ return 1;
+
+ res = get_page_from_l1e(sanitized_sl1e, d);
+
+ // If a privileged domain is attempting to install a map of a page it does
+ // not own, we let it succeed anyway.
+ //
+ if ( unlikely(!res) &&
+ IS_PRIV(d) &&
+ !shadow_mode_translate(d) &&
+ valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) &&
+ (owner = page_get_owner(mfn_to_page(mfn))) &&
+ (d != owner) )
+ {
+ res = get_page_from_l1e(sanitized_sl1e, owner);
+ SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
+ "which is owned by domain %d: %s\n",
+ d->domain_id, mfn_x(mfn), owner->domain_id,
+ res ? "success" : "failed");
+ }
+
+ if ( unlikely(!res) )
+ {
+ perfc_incrc(shadow_get_page_fail);
+ SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
+ }
+
+ return res;
+}
+
+static void inline
+shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
+{
+ if ( !shadow_mode_refcounts(d) )
+ return;
+
+ put_page_from_l1e(sl1e, d);
+}
+
+#if GUEST_PAGING_LEVELS >= 4
+static int shadow_set_l4e(struct vcpu *v,
+ shadow_l4e_t *sl4e,
+ shadow_l4e_t new_sl4e,
+ mfn_t sl4mfn)
+{
+ int flags = 0;
+ shadow_l4e_t old_sl4e;
+ paddr_t paddr;
+ ASSERT(sl4e != NULL);
+ old_sl4e = *sl4e;
+
+ if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
+
+ paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
+ | (((unsigned long)sl4e) & ~PAGE_MASK));
+
+ if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ sh_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr);
+ }
+
+ /* Write the new entry */
+ shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+ flags |= SHADOW_SET_CHANGED;
+
+ if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
+ if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
+ || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
+ shadow_l4e_get_flags(new_sl4e)) )
+ {
+ flags |= SHADOW_SET_FLUSH;
+ }
+ sh_put_ref(v, osl3mfn, paddr);
+ }
+ return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if GUEST_PAGING_LEVELS >= 3
+static int shadow_set_l3e(struct vcpu *v,
+ shadow_l3e_t *sl3e,
+ shadow_l3e_t new_sl3e,
+ mfn_t sl3mfn)
+{
+ int flags = 0;
+ shadow_l3e_t old_sl3e;
+ paddr_t paddr;
+ ASSERT(sl3e != NULL);
+ old_sl3e = *sl3e;
+
+ if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
+
+ paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+ | (((unsigned long)sl3e) & ~PAGE_MASK));
+
+ if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ sh_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr);
+ }
+
+ /* Write the new entry */
+ shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
+ flags |= SHADOW_SET_CHANGED;
+
+#if GUEST_PAGING_LEVELS == 3
+ /* We wrote a guest l3e in a PAE pagetable. This table is copied in
+ * the linear pagetable entries of its l2s, and may also be copied
+ * to a low memory location to make it fit in CR3. Report that we
+ * need to resync those copies (we can't wait for the guest to flush
+ * the TLB because it might be an increase in rights). */
+ {
+ struct vcpu *vcpu;
+
+ struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e);
+ for_each_vcpu(v->domain, vcpu)
+ {
+ if (info->vcpus & (1 << vcpu->vcpu_id))
+ {
+ // Remember that this flip/update needs to occur.
+ vcpu->arch.shadow.pae_flip_pending = 1;
+ flags |= SHADOW_SET_L3PAE_RECOPY;
+ }
+ }
+ }
+#endif
+
+ if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
+ if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
+ !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e),
+ shadow_l3e_get_flags(new_sl3e)) )
+ {
+ flags |= SHADOW_SET_FLUSH;
+ }
+ sh_put_ref(v, osl2mfn, paddr);
+ }
+ return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+static int shadow_set_l2e(struct vcpu *v,
+ shadow_l2e_t *sl2e,
+ shadow_l2e_t new_sl2e,
+ mfn_t sl2mfn)
+{
+ int flags = 0;
+ shadow_l2e_t old_sl2e;
+ paddr_t paddr;
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+ /* In 2-on-3 we work with pairs of l2es pointing at two-page
+ * shadows. Reference counting and up-pointers track from the first
+ * page of the shadow to the first l2e, so make sure that we're
+ * working with those:
+ * Align the pointer down so it's pointing at the first of the pair */
+ sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
+ /* Align the mfn of the shadow entry too */
+ new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
+#endif
+
+ ASSERT(sl2e != NULL);
+ old_sl2e = *sl2e;
+
+ if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
+
+ paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+ | (((unsigned long)sl2e) & ~PAGE_MASK));
+
+ if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ sh_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
+ }
+
+ /* Write the new entry */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+ {
+ shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
+ /* The l1 shadow is two pages long and needs to be pointed to by
+ * two adjacent l2es. The pair have the same flags, but point
+ * at odd and even MFNs */
+ ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
+ pair[1].l2 |= (1<<PAGE_SHIFT);
+ shadow_write_entries(sl2e, &pair, 2, sl2mfn);
+ }
+#else /* normal case */
+ shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
+#endif
+ flags |= SHADOW_SET_CHANGED;
+
+ if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
+ if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
+ !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e),
+ shadow_l2e_get_flags(new_sl2e)) )
+ {
+ flags |= SHADOW_SET_FLUSH;
+ }
+ sh_put_ref(v, osl1mfn, paddr);
+ }
+ return flags;
+}
+
+static int shadow_set_l1e(struct vcpu *v,
+ shadow_l1e_t *sl1e,
+ shadow_l1e_t new_sl1e,
+ mfn_t sl1mfn)
+{
+ int flags = 0;
+ struct domain *d = v->domain;
+ shadow_l1e_t old_sl1e;
+ ASSERT(sl1e != NULL);
+
+ old_sl1e = *sl1e;
+
+ if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
+
+ if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT )
+ {
+ /* About to install a new reference */
+ if ( shadow_mode_refcounts(d) ) {
+ if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 )
+ {
+ /* Doesn't look like a pagetable. */
+ flags |= SHADOW_SET_ERROR;
+ new_sl1e = shadow_l1e_empty();
+ }
+ }
+ }
+
+ /* Write the new entry */
+ shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
+ flags |= SHADOW_SET_CHANGED;
+
+ if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
+ {
+ /* We lost a reference to an old mfn. */
+ /* N.B. Unlike higher-level sets, we never need an extra flush
+ * when writing an l1e. Because it points to the same guest frame
+ * as the guest l1e did, it's the guest's responsibility to
+ * trigger a flush later. */
+ if ( shadow_mode_refcounts(d) )
+ {
+ shadow_put_page_from_l1e(old_sl1e, d);
+ }
+ }
+ return flags;
+}
+
+
+/**************************************************************************/
+/* These functions take a vcpu and a virtual address, and return a pointer
+ * to the appropriate level N entry from the shadow tables.
+ * If the necessary tables are not present in the shadow, they return NULL. */
+
+/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has
+ * more levels than the guest, the upper levels are always fixed and do not
+ * reflect any information from the guest, so we do not use these functions
+ * to access them. */
+
+#if GUEST_PAGING_LEVELS >= 4
+static shadow_l4e_t *
+shadow_get_l4e(struct vcpu *v, unsigned long va)
+{
+ /* Reading the top level table is always valid. */
+ return sh_linear_l4_table(v) + shadow_l4_linear_offset(va);
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#if GUEST_PAGING_LEVELS >= 3
+static shadow_l3e_t *
+shadow_get_l3e(struct vcpu *v, unsigned long va)
+{
+#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+ /* Get the l4 */
+ shadow_l4e_t *sl4e = shadow_get_l4e(v, va);
+ ASSERT(sl4e != NULL);
+ if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) )
+ return NULL;
+ ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e)));
+ /* l4 was present; OK to get the l3 */
+ return sh_linear_l3_table(v) + shadow_l3_linear_offset(va);
+#else /* PAE... */
+ /* Top level is always mapped */
+ ASSERT(v->arch.shadow_vtable);
+ return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va);
+#endif
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+
+static shadow_l2e_t *
+shadow_get_l2e(struct vcpu *v, unsigned long va)
+{
+#if GUEST_PAGING_LEVELS >= 3 /* 64bit/PAE... */
+ /* Get the l3 */
+ shadow_l3e_t *sl3e = shadow_get_l3e(v, va);
+ if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
+ return NULL;
+ ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e)));
+ /* l3 was present; OK to get the l2 */
+#endif
+ return sh_linear_l2_table(v) + shadow_l2_linear_offset(va);
+}
+
+
+#if 0 // avoid the compiler warning for now...
+
+static shadow_l1e_t *
+shadow_get_l1e(struct vcpu *v, unsigned long va)
+{
+ /* Get the l2 */
+ shadow_l2e_t *sl2e = shadow_get_l2e(v, va);
+ if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) )
+ return NULL;
+ ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e)));
+ /* l2 was present; OK to get the l1 */
+ return sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
+}
+
+#endif
+
+
+/**************************************************************************/
+/* Macros to walk pagetables. These take the shadow of a pagetable and
+ * walk every "interesting" entry. That is, they don't touch Xen mappings,
+ * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every
+ * second entry (since pairs of entries are managed together). For multi-page
+ * shadows they walk all pages.
+ *
+ * Arguments are an MFN, the variable to point to each entry, a variable
+ * to indicate that we are done (we will shortcut to the end of the scan
+ * when _done != 0), a variable to indicate that we should avoid Xen mappings,
+ * and the code.
+ *
+ * WARNING: These macros have side-effects. They change the values of both
+ * the pointer and the MFN. */
+
+static inline void increment_ptr_to_guest_entry(void *ptr)
+{
+ if ( ptr )
+ {
+ guest_l1e_t **entry = ptr;
+ (*entry)++;
+ }
+}
+
+/* All kinds of l1: touch all entries */
+#define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
+do { \
+ int _i; \
+ shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \
+ ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask) \
+ == PGC_SH_l1_shadow \
+ || (mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask) \
+ == PGC_SH_fl1_shadow); \
+ for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ (_sl1e) = _sp + _i; \
+ if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl1p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
+do { \
+ int __done = 0; \
+ _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
+ ({ (__done = _done); }), _code); \
+ _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \
+ if ( !__done ) \
+ _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \
+ ({ (__done = _done); }), _code); \
+} while (0)
+#else /* Everything else; l1 shadows are only one page */
+#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \
+ _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
+#endif
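+
+/* Illustrative sketch of how the l1 walker above is used; kept under
+ * #if 0, not built. A real caller appears later in this file in
+ * sh_destroy_l1_shadow(); the function name is made up and sl1mfn is
+ * assumed to be an l1 shadow mfn the caller already holds. */
+#if 0
+static void example_walk_l1_shadow(mfn_t sl1mfn)
+{
+ shadow_l1e_t *sl1e;
+ /* Passing 0 for _gl1p and _done means "no guest pointer to track,
+ * walk every present entry"; _code runs with sl1e pointing at it. */
+ SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
+ /* ... examine or update *sl1e here ... */
+ });
+}
+#endif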
+
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+
+/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
+#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i, _j, __done = 0; \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
+ == PGC_SH_l2_32_shadow); \
+ for ( _j = 0; _j < 4 && !__done; _j++ ) \
+ { \
+ shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \
+ if ( (!(_xen)) \
+ || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \
+ < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( (__done = (_done)) ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+ _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \
+ } \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 2
+
+/* 32-bit on 32-bit: avoid Xen entries */
+#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
+ == PGC_SH_l2_32_shadow); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
+ if ( (!(_xen)) \
+ || \
+ (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 3
+
+/* PAE: if it's an l2h, don't touch Xen mappings */
+#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
+ == PGC_SH_l2_pae_shadow \
+ || (mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
+ == PGC_SH_l2h_pae_shadow); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
+ if ( (!(_xen)) \
+ || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
+ != PGC_SH_l2h_pae_shadow) \
+ || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \
+ < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#else
+
+/* 64-bit l2: touch all entries */
+#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \
+ ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \
+ == PGC_SH_l2_64_shadow); \
+ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ (_sl2e) = _sp + _i; \
+ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl2p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#endif /* different kinds of l2 */
+
+#if GUEST_PAGING_LEVELS == 3
+
+/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). */
+#define SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code) \
+do { \
+ int _i; \
+ for ( _i = 0; _i < 4; _i++ ) \
+ { \
+ if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ _sl3e++; \
+ increment_ptr_to_guest_entry(_gl3p); \
+ } \
+} while (0)
+
+/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */
+#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
+do { \
+ int _i, _j, _k, __done = 0; \
+ ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask) \
+ == PGC_SH_l3_pae_shadow); \
+ /* The subshadows are split, 64 on each page of the shadow */ \
+ for ( _j = 0; _j < 2 && !__done; _j++ ) \
+ { \
+ void *_sp = sh_map_domain_page(_sl3mfn); \
+ for ( _i = 0; _i < 64; _i++ ) \
+ { \
+ /* Every second 32-byte region is a bookkeeping entry */ \
+ _sl3e = (shadow_l3e_t *)(_sp + (64 * _i)); \
+ if ( (sl3p_to_info(_sl3e))->refcount > 0 ) \
+ SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p, \
+ ({ __done = (_done); __done; }), \
+ _code); \
+ else \
+ for ( _k = 0 ; _k < 4 ; _k++ ) \
+ increment_ptr_to_guest_entry(_gl3p); \
+ if ( __done ) break; \
+ } \
+ sh_unmap_domain_page(_sp); \
+ _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1); \
+ } \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 4
+
+/* 64-bit l3: touch all entries */
+#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \
+do { \
+ int _i; \
+ shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \
+ ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask) \
+ == PGC_SH_l3_64_shadow); \
+ for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ (_sl3e) = _sp + _i; \
+ if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ increment_ptr_to_guest_entry(_gl3p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+/* 64-bit l4: avoid Xen mappings */
+#define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \
+do { \
+ int _i; \
+ shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \
+ ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH_type_mask) \
+ == PGC_SH_l4_64_shadow); \
+ for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \
+ { \
+ if ( (!(_xen)) || is_guest_l4_slot(_i) ) \
+ { \
+ (_sl4e) = _sp + _i; \
+ if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \
+ {_code} \
+ if ( _done ) break; \
+ } \
+ increment_ptr_to_guest_entry(_gl4p); \
+ } \
+ unmap_shadow_page(_sp); \
+} while (0)
+
+#endif
+
+
+
+/**************************************************************************/
+/* Functions to install Xen mappings and linear mappings in shadow pages */
+
+static mfn_t sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type);
+
+// XXX -- this function should probably be moved to shadow-common.c, but that
+// probably wants to wait until the shadow types have been moved from
+// shadow-types.h to shadow-private.h
+//
+#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
+void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
+{
+ struct domain *d = v->domain;
+ shadow_l4e_t *sl4e;
+
+ sl4e = sh_map_domain_page(sl4mfn);
+ ASSERT(sl4e != NULL);
+ ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
+
+ /* Copy the common Xen mappings from the idle domain */
+ memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
+ &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
+ ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
+
+ /* Install the per-domain mappings for this domain */
+ sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
+ shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
+ __PAGE_HYPERVISOR);
+
+ /* Linear mapping */
+ sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
+ shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
+ sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
+
+ if ( shadow_mode_translate(v->domain) )
+ {
+ /* install domain-specific P2M table */
+ sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
+ shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
+ __PAGE_HYPERVISOR);
+ }
+
+ sh_unmap_domain_page(sl4e);
+}
+#endif
+
+#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
+// For 3-on-3 PV guests, we need to make sure the xen mappings are in
+// place, which means that we need to populate the l2h entry in the l3
+// table.
+
+void sh_install_xen_entries_in_l2h(struct vcpu *v,
+ mfn_t sl2hmfn)
+{
+ struct domain *d = v->domain;
+ shadow_l2e_t *sl2e;
+ int i;
+
+ sl2e = sh_map_domain_page(sl2hmfn);
+ ASSERT(sl2e != NULL);
+ ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
+
+ /* Copy the common Xen mappings from the idle domain */
+ memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
+ &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
+ L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+
+ /* Install the per-domain mappings for this domain */
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ shadow_l2e_from_mfn(
+ page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
+ __PAGE_HYPERVISOR);
+
+ /* We don't set up a linear mapping here because we can't until this
+ * l2h is installed in an l3e. sh_update_linear_entries() handles
+ * the linear mappings when the l3 is loaded. */
+
+ if ( shadow_mode_translate(d) )
+ {
+ /* Install the domain-specific p2m table */
+ l3_pgentry_t *p2m;
+ ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+ p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+ for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
+ {
+ sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
+ shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
+ __PAGE_HYPERVISOR);
+ }
+ sh_unmap_domain_page(p2m);
+ }
+
+ sh_unmap_domain_page(sl2e);
+}
+
+void sh_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn)
+{
+ shadow_l3e_t *sl3e;
+ guest_l3e_t *gl3e = v->arch.guest_vtable;
+ shadow_l3e_t new_sl3e;
+ gfn_t l2gfn;
+ mfn_t l2gmfn, l2smfn;
+ int r;
+
+ ASSERT(!shadow_mode_external(v->domain));
+ ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT);
+ l2gfn = guest_l3e_get_gfn(gl3e[3]);
+ l2gmfn = sh_gfn_to_mfn(v->domain, gfn_x(l2gfn));
+ l2smfn = get_shadow_status(v, l2gmfn, PGC_SH_l2h_shadow);
+ if ( !valid_mfn(l2smfn) )
+ {
+ l2smfn = sh_make_shadow(v, l2gmfn, PGC_SH_l2h_shadow);
+ }
+ l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e,
+ ft_prefetch);
+ sl3e = sh_map_domain_page(sl3mfn);
+ r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn);
+ sh_unmap_domain_page(sl3e);
+}
+#endif
+
+
+#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
+void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
+{
+ struct domain *d = v->domain;
+ shadow_l2e_t *sl2e;
+ int i;
+
+ sl2e = sh_map_domain_page(sl2mfn);
+ ASSERT(sl2e != NULL);
+ ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
+
+ /* Copy the common Xen mappings from the idle domain */
+ memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
+ &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
+ L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+
+ /* Install the per-domain mappings for this domain */
+ for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+ sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+ shadow_l2e_from_mfn(
+ page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
+ __PAGE_HYPERVISOR);
+
+ /* Linear mapping */
+ sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
+ shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
+ sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
+
+ if ( shadow_mode_translate(d) )
+ {
+ /* install domain-specific P2M table */
+ sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
+ shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
+ __PAGE_HYPERVISOR);
+ }
+
+ sh_unmap_domain_page(sl2e);
+}
+#endif
+
+
+
+
+
+/**************************************************************************/
+/* Create a shadow of a given guest page.
+ */
+static mfn_t
+sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+{
+ mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
+ SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
+ mfn_x(gmfn), shadow_type, mfn_x(smfn));
+
+ if ( shadow_type != PGC_SH_guest_root_type )
+ /* Lower-level shadow, not yet linked from a higher level */
+ mfn_to_page(smfn)->up = 0;
+
+ // Create the Xen mappings...
+ if ( !shadow_mode_external(v->domain) )
+ {
+ switch (shadow_type)
+ {
+#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
+ case PGC_SH_l4_shadow:
+ sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
+#endif
+#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
+ case PGC_SH_l3_shadow:
+ sh_install_xen_entries_in_l3(v, gmfn, smfn); break;
+ case PGC_SH_l2h_shadow:
+ sh_install_xen_entries_in_l2h(v, smfn); break;
+#endif
+#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
+ case PGC_SH_l2_shadow:
+ sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
+#endif
+ default: /* Do nothing */ break;
+ }
+ }
+
+ shadow_promote(v, gmfn, shadow_type);
+ set_shadow_status(v, gmfn, shadow_type, smfn);
+
+ return smfn;
+}
+
+/* Make a splintered superpage shadow */
+static mfn_t
+make_fl1_shadow(struct vcpu *v, gfn_t gfn)
+{
+ mfn_t smfn = shadow_alloc(v->domain, PGC_SH_fl1_shadow,
+ (unsigned long) gfn_x(gfn));
+
+ SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" SH_PRI_mfn "\n",
+ gfn_x(gfn), mfn_x(smfn));
+
+ set_fl1_shadow_status(v, gfn, smfn);
+ return smfn;
+}
+
+
+#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
+mfn_t
+sh_make_monitor_table(struct vcpu *v)
+{
+
+ ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
+
+#if CONFIG_PAGING_LEVELS == 4
+ {
+ struct domain *d = v->domain;
+ mfn_t m4mfn;
+ m4mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
+ sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
+ /* Remember the level of this table */
+ mfn_to_page(m4mfn)->shadow_flags = 4;
+#if SHADOW_PAGING_LEVELS < 4
+ // Install a monitor l3 table in slot 0 of the l4 table.
+ // This is used for shadow linear maps.
+ {
+ mfn_t m3mfn;
+ l4_pgentry_t *l4e;
+ m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
+ mfn_to_page(m3mfn)->shadow_flags = 3;
+ l4e = sh_map_domain_page(m4mfn);
+ l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
+ sh_unmap_domain_page(l4e);
+ }
+#endif /* SHADOW_PAGING_LEVELS < 4 */
+ return m4mfn;
+ }
+
+#elif CONFIG_PAGING_LEVELS == 3
+
+ {
+ struct domain *d = v->domain;
+ mfn_t m3mfn, m2mfn;
+ l3_pgentry_t *l3e;
+ l2_pgentry_t *l2e;
+ int i;
+
+ m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
+ /* Remember the level of this table */
+ mfn_to_page(m3mfn)->shadow_flags = 3;
+
+ // Install a monitor l2 table in slot 3 of the l3 table.
+ // This is used for all Xen entries, including linear maps
+ m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
+ mfn_to_page(m2mfn)->shadow_flags = 2;
+ l3e = sh_map_domain_page(m3mfn);
+ l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
+ sh_install_xen_entries_in_l2h(v, m2mfn);
+ /* Install the monitor's own linear map */
+ l2e = sh_map_domain_page(m2mfn);
+ for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+ l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
+ (l3e_get_flags(l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR)
+ : l2e_empty();
+ sh_unmap_domain_page(l2e);
+ sh_unmap_domain_page(l3e);
+
+ SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
+ return m3mfn;
+ }
+
+#elif CONFIG_PAGING_LEVELS == 2
+
+ {
+ struct domain *d = v->domain;
+ mfn_t m2mfn;
+ m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
+ sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
+ /* Remember the level of this table */
+ mfn_to_page(m2mfn)->shadow_flags = 2;
+ return m2mfn;
+ }
+
+#else
+#error this should not happen
+#endif /* CONFIG_PAGING_LEVELS */
+}
+#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
+
+/**************************************************************************/
+/* These functions also take a virtual address (via the guest walk) and
+ * return the level-N shadow table mfn and entry, creating the shadow
+ * pagetables if they are needed. The fetch_type argument tells us whether
+ * we are handling a demand fault (so we know what to do about accessed
+ * bits &c). If the necessary tables are not present in the guest, they
+ * return NULL. */
+#if GUEST_PAGING_LEVELS >= 4
+static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl4mfn)
+{
+ /* There is always a shadow of the top level table. Get it. */
+ *sl4mfn = pagetable_get_mfn(v->arch.shadow_table);
+ /* Reading the top level table is always valid. */
+ return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#if GUEST_PAGING_LEVELS >= 3
+static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl3mfn,
+ fetch_type_t ft)
+{
+#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+ mfn_t sl4mfn;
+ shadow_l4e_t *sl4e;
+ if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */
+ /* Get the l4e */
+ sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
+ ASSERT(sl4e != NULL);
+ if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
+ {
+ *sl3mfn = shadow_l4e_get_mfn(*sl4e);
+ ASSERT(valid_mfn(*sl3mfn));
+ }
+ else
+ {
+ int r;
+ shadow_l4e_t new_sl4e;
+ /* No l3 shadow installed: find and install it. */
+ *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH_l3_shadow);
+ if ( !valid_mfn(*sl3mfn) )
+ {
+ /* No l3 shadow of this page exists at all: make one. */
+ *sl3mfn = sh_make_shadow(v, gw->l3mfn, PGC_SH_l3_shadow);
+ }
+ /* Install the new sl3 table in the sl4e */
+ l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn,
+ *sl3mfn, &new_sl4e, ft);
+ r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
+ ASSERT((r & SHADOW_SET_FLUSH) == 0);
+ }
+ /* Now follow it down a level. Guaranteed to succeed. */
+ return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
+#else /* PAE... */
+ /* There is always a shadow of the top level table. Get it. */
+ *sl3mfn = pagetable_get_mfn(v->arch.shadow_table);
+ /* This next line is important: the shadow l3 table is in an 8k
+ * shadow and we need to return the right mfn of the pair. This call
+ * will set it for us as a side-effect. */
+ (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e));
+ ASSERT(v->arch.shadow_vtable);
+ return ((shadow_l3e_t *)v->arch.shadow_vtable)
+ + shadow_l3_table_offset(gw->va);
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+
+static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl2mfn,
+ fetch_type_t ft)
+{
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */
+ mfn_t sl3mfn = _mfn(INVALID_MFN);
+ shadow_l3e_t *sl3e;
+ if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
+ /* Get the l3e */
+ sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
+ ASSERT(sl3e != NULL); /* Since we know guest PT is valid this far */
+ if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
+ {
+ *sl2mfn = shadow_l3e_get_mfn(*sl3e);
+ ASSERT(valid_mfn(*sl2mfn));
+ }
+ else
+ {
+ int r;
+ shadow_l3e_t new_sl3e;
+ /* No l2 shadow installed: find and install it. */
+ *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH_l2_shadow);
+ if ( !valid_mfn(*sl2mfn) )
+ {
+ /* No l2 shadow of this page exists at all: make one. */
+ *sl2mfn = sh_make_shadow(v, gw->l2mfn, PGC_SH_l2_shadow);
+ }
+ /* Install the new sl2 table in the sl3e */
+ l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn,
+ *sl2mfn, &new_sl3e, ft);
+ r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
+ ASSERT((r & SHADOW_SET_FLUSH) == 0);
+#if GUEST_PAGING_LEVELS == 3
+ /* Need to sync up the linear maps, as we are about to use them */
+ ASSERT( r & SHADOW_SET_L3PAE_RECOPY );
+ sh_pae_recopy(v->domain);
+#endif
+ }
+ /* Now follow it down a level. Guaranteed to succeed. */
+ return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
+#else /* 32bit... */
+ /* There is always a shadow of the top level table. Get it. */
+ *sl2mfn = pagetable_get_mfn(v->arch.shadow_table);
+ /* This next line is important: the guest l2 has a 16k
+ * shadow, so we need to return the right mfn of the four. This
+ * call will set it for us as a side-effect. */
+ (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
+ /* Reading the top level table is always valid. */
+ return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
+#endif
+}
+
+
+static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
+ walk_t *gw,
+ mfn_t *sl1mfn,
+ fetch_type_t ft)
+{
+ mfn_t sl2mfn;
+ shadow_l2e_t *sl2e;
+
+ /* Get the l2e */
+ sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
+ if ( sl2e == NULL ) return NULL;
+ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
+ {
+ *sl1mfn = shadow_l2e_get_mfn(*sl2e);
+ ASSERT(valid_mfn(*sl1mfn));
+ }
+ else
+ {
+ shadow_l2e_t new_sl2e;
+ int r, flags = guest_l2e_get_flags(*gw->l2e);
+ /* No l1 shadow installed: find and install it. */
+ if ( !(flags & _PAGE_PRESENT) )
+ return NULL; /* No guest page. */
+ if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) )
+ {
+ /* Splintering a superpage */
+ gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
+ *sl1mfn = get_fl1_shadow_status(v, l2gfn);
+ if ( !valid_mfn(*sl1mfn) )
+ {
+ /* No fl1 shadow of this superpage exists at all: make one. */
+ *sl1mfn = make_fl1_shadow(v, l2gfn);
+ }
+ }
+ else
+ {
+ /* Shadowing an actual guest l1 table */
+ if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
+ *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH_l1_shadow);
+ if ( !valid_mfn(*sl1mfn) )
+ {
+ /* No l1 shadow of this page exists at all: make one. */
+ *sl1mfn = sh_make_shadow(v, gw->l1mfn, PGC_SH_l1_shadow);
+ }
+ }
+ /* Install the new sl1 table in the sl2e */
+ l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn,
+ *sl1mfn, &new_sl2e, ft);
+ r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
+ ASSERT((r & SHADOW_SET_FLUSH) == 0);
+ /* This next line is important: in 32-on-PAE and 32-on-64 modes,
+ * the guest l1 table has an 8k shadow, and we need to return
+ * the right mfn of the pair. This call will set it for us as a
+ * side-effect. (In all other cases, it's a no-op and will be
+ * compiled out.) */
+ (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
+ }
+ /* Now follow it down a level. Guaranteed to succeed. */
+ return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
+}
+
+
+
+/**************************************************************************/
+/* Destructors for shadow tables:
+ * Unregister the shadow, decrement refcounts of any entries present in it,
+ * and release the memory.
+ *
+ * N.B. These destructors do not clear the contents of the shadows.
+ * This allows us to delay TLB shootdowns until the page is being reused.
+ * See shadow_alloc() and shadow_free() for how this is handled.
+ */
+
+#if GUEST_PAGING_LEVELS >= 4
+void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l4e_t *sl4e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
+ mfn_t gmfn, sl4mfn;
+ int xen_mappings;
+
+ SHADOW_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH_l4_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow_status(v, gmfn, t, smfn);
+ shadow_demote(v, gmfn, t);
+ /* Take this shadow off the list of root shadows */
+ list_del_init(&mfn_to_page(smfn)->list);
+
+ /* Decrement refcounts of all the old entries */
+ xen_mappings = (!shadow_mode_external(v->domain));
+ sl4mfn = smfn;
+ SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
+ if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT )
+ {
+ sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
+ (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl4e & ~PAGE_MASK));
+ }
+ });
+
+ /* Put the memory back in the pool */
+ shadow_free(v->domain, smfn);
+}
+#endif
+
+#if GUEST_PAGING_LEVELS >= 3
+void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l3e_t *sl3e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
+ mfn_t gmfn, sl3mfn;
+
+ SHADOW_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH_l3_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow_status(v, gmfn, t, smfn);
+ shadow_demote(v, gmfn, t);
+#if GUEST_PAGING_LEVELS == 3
+ /* Take this shadow off the list of root shadows */
+ list_del_init(&mfn_to_page(smfn)->list);
+#endif
+
+ /* Decrement refcounts of all the old entries */
+ sl3mfn = smfn;
+ SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
+ if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT )
+ sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
+ (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl3e & ~PAGE_MASK));
+ });
+
+ /* Put the memory back in the pool */
+ shadow_free(v->domain, smfn);
+}
+#endif
+
+
+#if GUEST_PAGING_LEVELS == 3
+static void sh_destroy_l3_subshadow(struct vcpu *v,
+ shadow_l3e_t *sl3e)
+/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. */
+{
+ int i;
+ ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0);
+ for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ )
+ if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT )
+ sh_put_ref(v, shadow_l3e_get_mfn(sl3e[i]),
+ maddr_from_mapped_domain_page(sl3e));
+}
+#endif
+
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+void sh_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn)
+/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */
+{
+ int i, j;
+ struct pae_l3_bookkeeping *bk;
+
+ ASSERT((mfn_to_page(smfn)->count_info & PGC_SH_type_mask)
+ == PGC_SH_l3_pae_shadow);
+ /* The subshadows are split, 64 on each page of the shadow */
+ for ( i = 0; i < 2; i++ )
+ {
+ void *p = sh_map_domain_page(_mfn(mfn_x(smfn) + i));
+ for ( j = 0; j < 64; j++ )
+ {
+ /* Every second 32-byte region is a bookkeeping entry */
+ bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32);
+ if ( bk->pinned )
+ sh_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn);
+ /* Check whether we've just freed the whole shadow */
+ if ( (mfn_to_page(smfn)->count_info & PGC_SH_count_mask) == 0 )
+ {
+ sh_unmap_domain_page(p);
+ return;
+ }
+ }
+ sh_unmap_domain_page(p);
+ }
+}
+#endif
+
+void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
+{
+ shadow_l2e_t *sl2e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
+ mfn_t gmfn, sl2mfn;
+ int xen_mappings;
+
+ SHADOW_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH_l2_shadow
+ || t == PGC_SH_l2h_pae_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow_status(v, gmfn, t, smfn);
+ shadow_demote(v, gmfn, t);
+#if GUEST_PAGING_LEVELS == 2
+ /* Take this shadow off the list of root shadows */
+ list_del_init(&mfn_to_page(smfn)->list);
+#endif
+
+ /* Decrement refcounts of all the old entries */
+ sl2mfn = smfn;
+ xen_mappings = (!shadow_mode_external(v->domain) &&
+ ((GUEST_PAGING_LEVELS == 2) ||
+ ((GUEST_PAGING_LEVELS == 3) &&
+ (t == PGC_SH_l2h_pae_shadow))));
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
+ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT )
+ sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
+ (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+ | ((unsigned long)sl2e & ~PAGE_MASK));
+ });
+
+ /* Put the memory back in the pool */
+ shadow_free(v->domain, smfn);
+}
+
+void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
+{
+ struct domain *d = v->domain;
+ shadow_l1e_t *sl1e;
+ u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
+
+ SHADOW_DEBUG(DESTROY_SHADOW,
+ "%s(%05lx)\n", __func__, mfn_x(smfn));
+ ASSERT(t == PGC_SH_l1_shadow || t == PGC_SH_fl1_shadow);
+
+ /* Record that the guest page isn't shadowed any more (in this type) */
+ if ( t == PGC_SH_fl1_shadow )
+ {
+ gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_fl1_shadow_status(v, gfn, smfn);
+ }
+ else
+ {
+ mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+ delete_shadow_status(v, gmfn, t, smfn);
+ shadow_demote(v, gmfn, t);
+ }
+
+ if ( shadow_mode_refcounts(d) )
+ {
+ /* Decrement refcounts of all the old entries */
+ mfn_t sl1mfn = smfn;
+ SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
+ if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT )
+ shadow_put_page_from_l1e(*sl1e, d);
+ });
+ }
+
+ /* Put the memory back in the pool */
+ shadow_free(v->domain, smfn);
+}
+
+#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
+void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
+{
+ struct domain *d = v->domain;
+ ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH_type_mask)
+ == PGC_SH_monitor_table);
+
+#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
+ /* Need to destroy the l3 monitor page in slot 0 too */
+ {
+ l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
+ ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+ shadow_free(d, _mfn(l4e_get_pfn(l4e[0])));
+ sh_unmap_domain_page(l4e);
+ }
+#elif CONFIG_PAGING_LEVELS == 3
+ /* Need to destroy the l2 monitor page in slot 4 too */
+ {
+ l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
+ ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
+ shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
+ sh_unmap_domain_page(l3e);
+ }
+#endif
+
+ /* Put the memory back in the pool */
+ shadow_free(d, mmfn);
+}
+#endif
+
+/**************************************************************************/
+/* Functions to destroy non-Xen mappings in a pagetable hierarchy.
+ * These are called from common code when we are running out of shadow
+ * memory, and unpinning all the top-level shadows hasn't worked.
+ *
+ * This implementation is pretty crude and slow, but we hope that it won't
+ * be called very often. */
+
+#if GUEST_PAGING_LEVELS == 2
+
+void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
+{
+ shadow_l2e_t *sl2e;
+ int xen_mappings = !shadow_mode_external(v->domain);
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
+ (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ });
+}
+
+#elif GUEST_PAGING_LEVELS == 3
+
+void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn)
+/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */
+{
+ shadow_l3e_t *sl3e;
+ SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
+ if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) {
+ mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e);
+ if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask)
+ == PGC_SH_l2h_pae_shadow )
+ {
+ /* High l2: need to pick particular l2es to unhook */
+ shadow_l2e_t *sl2e;
+ SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, {
+ (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+ });
+ }
+ else
+ {
+ /* Normal l2: can safely unhook the whole l3e */
+ (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
+ }
+ }
+ });
+ /* We've changed PAE L3 entries: must sync up various copies of them */
+ sh_pae_recopy(v->domain);
+}
+
+#elif GUEST_PAGING_LEVELS == 4
+
+void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
+{
+ shadow_l4e_t *sl4e;
+ int xen_mappings = !shadow_mode_external(v->domain);
+ SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
+ (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+ });
+}
+
+#endif
+
+/**************************************************************************/
+/* Internal translation functions.
+ * These functions require a pointer to the shadow entry that will be updated.
+ */
+
+/* These functions take a new guest entry, translate it to shadow and write
+ * the shadow entry.
+ *
+ * They return the same bitmaps as the shadow_set_lXe() functions.
+ */
+
+#if GUEST_PAGING_LEVELS >= 4
+static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
+{
+ shadow_l4e_t new_sl4e;
+ guest_l4e_t *new_gl4e = new_ge;
+ shadow_l4e_t *sl4p = se;
+ mfn_t sl3mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow_validate_gl4e_calls);
+
+ if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
+ {
+ gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
+ mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
+ if ( valid_mfn(gl3mfn) )
+ sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH_l3_shadow);
+ else
+ result |= SHADOW_SET_ERROR;
+ }
+ l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
+ sl3mfn, &new_sl4e, ft_prefetch);
+ result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
+ return result;
+}
+#endif // GUEST_PAGING_LEVELS >= 4
+
+#if GUEST_PAGING_LEVELS >= 3
+static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
+{
+ shadow_l3e_t new_sl3e;
+ guest_l3e_t *new_gl3e = new_ge;
+ shadow_l3e_t *sl3p = se;
+ mfn_t sl2mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow_validate_gl3e_calls);
+
+ if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
+ {
+ gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
+ mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
+ if ( valid_mfn(gl2mfn) )
+ sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH_l2_shadow);
+ else
+ result |= SHADOW_SET_ERROR;
+ }
+ l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN),
+ sl2mfn, &new_sl3e, ft_prefetch);
+ result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
+
+#if GUEST_PAGING_LEVELS == 3
+ /* We have changed a PAE l3 entry: need to sync up the possible copies
+ * of it */
+ if ( result & SHADOW_SET_L3PAE_RECOPY )
+ sh_pae_recopy(v->domain);
+#endif
+
+ return result;
+}
+#endif // GUEST_PAGING_LEVELS >= 3
+
+static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
+{
+ shadow_l2e_t new_sl2e;
+ guest_l2e_t *new_gl2e = new_ge;
+ shadow_l2e_t *sl2p = se;
+ mfn_t sl1mfn = _mfn(INVALID_MFN);
+ int result = 0;
+
+ perfc_incrc(shadow_validate_gl2e_calls);
+
+ if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
+ {
+ gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
+ if ( guest_supports_superpages(v) &&
+ (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
+ {
+ // superpage -- need to look up the shadow L1 which holds the
+ // splitters...
+ sl1mfn = get_fl1_shadow_status(v, gl1gfn);
+#if 0
+ // XXX - it's possible that we want to do some kind of prefetch
+ // for superpage fl1's here, but this is *not* on the demand path,
+ // so we'll hold off trying that for now...
+ //
+ if ( !valid_mfn(sl1mfn) )
+ sl1mfn = make_fl1_shadow(v, gl1gfn);
+#endif
+ }
+ else
+ {
+ mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
+ if ( valid_mfn(gl1mfn) )
+ sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH_l1_shadow);
+ else
+ result |= SHADOW_SET_ERROR;
+ }
+ }
+ l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
+ sl1mfn, &new_sl2e, ft_prefetch);
+ result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
+
+ return result;
+}
+
+static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
+{
+ shadow_l1e_t new_sl1e;
+ guest_l1e_t *new_gl1e = new_ge;
+ shadow_l1e_t *sl1p = se;
+ gfn_t gfn;
+ mfn_t mfn;
+ int result = 0;
+
+ perfc_incrc(shadow_validate_gl1e_calls);
+
+ gfn = guest_l1e_get_gfn(*new_gl1e);
+ mfn = vcpu_gfn_to_mfn(v, gfn);
+
+ l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e,
+ /* mmio? */ !valid_mfn(mfn));
+
+ result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
+ return result;
+}
+
+
+/**************************************************************************/
+/* Functions which translate and install the shadows of arbitrary guest
+ * entries that we have just seen the guest write. */
+
+
+static inline int
+sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
+ void *new_gp, u32 size, u32 sh_type,
+ u32 (*shadow_index)(mfn_t *smfn, u32 idx),
+ int (*validate_ge)(struct vcpu *v, void *ge,
+ mfn_t smfn, void *se))
+/* Generic function for mapping and validating. */
+{
+ mfn_t smfn, smfn2, map_mfn;
+ shadow_l1e_t *sl1p;
+ u32 shadow_idx, guest_idx;
+ int result = 0;
+
+ /* Align address and size to guest entry boundaries */
+ size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
+ new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
+ size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
+ ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
+
+ /* Map the shadow page */
+ smfn = get_shadow_status(v, gmfn, sh_type);
+ ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */
+ guest_idx = guest_index(new_gp);
+ map_mfn = smfn;
+ shadow_idx = shadow_index(&map_mfn, guest_idx);
+ sl1p = map_shadow_page(map_mfn);
+
+ /* Validate one entry at a time */
+ while ( size )
+ {
+ smfn2 = smfn;
+ guest_idx = guest_index(new_gp);
+ shadow_idx = shadow_index(&smfn2, guest_idx);
+ if ( mfn_x(smfn2) != mfn_x(map_mfn) )
+ {
+ /* We have moved to another page of the shadow */
+ map_mfn = smfn2;
+ unmap_shadow_page(sl1p);
+ sl1p = map_shadow_page(map_mfn);
+ }
+ result |= validate_ge(v,
+ new_gp,
+ map_mfn,
+ &sl1p[shadow_idx]);
+ size -= sizeof(guest_l1e_t);
+ new_gp += sizeof(guest_l1e_t);
+ }
+ unmap_shadow_page(sl1p);
+ return result;
+}
+
+
+int
+sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
+ void *new_gl4p, u32 size)
+{
+#if GUEST_PAGING_LEVELS >= 4
+ return sh_map_and_validate(v, gl4mfn, new_gl4p, size,
+ PGC_SH_l4_shadow,
+ shadow_l4_index,
+ validate_gl4e);
+#else // ! GUEST_PAGING_LEVELS >= 4
+ SHADOW_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+int
+sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
+ void *new_gl3p, u32 size)
+{
+#if GUEST_PAGING_LEVELS >= 3
+ return sh_map_and_validate(v, gl3mfn, new_gl3p, size,
+ PGC_SH_l3_shadow,
+ shadow_l3_index,
+ validate_gl3e);
+#else // ! GUEST_PAGING_LEVELS >= 3
+ SHADOW_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+int
+sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
+ void *new_gl2p, u32 size)
+{
+ return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
+ PGC_SH_l2_shadow,
+ shadow_l2_index,
+ validate_gl2e);
+}
+
+int
+sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
+ void *new_gl2p, u32 size)
+{
+#if GUEST_PAGING_LEVELS == 3
+ return sh_map_and_validate(v, gl2mfn, new_gl2p, size,
+ PGC_SH_l2h_shadow,
+ shadow_l2_index,
+ validate_gl2e);
+#else /* Non-PAE guests don't have different kinds of l2 table */
+ SHADOW_PRINTK("called in wrong paging mode!\n");
+ BUG();
+ return 0;
+#endif
+}
+
+int
+sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
+ void *new_gl1p, u32 size)
+{
+ return sh_map_and_validate(v, gl1mfn, new_gl1p, size,
+ PGC_SH_l1_shadow,
+ shadow_l1_index,
+ validate_gl1e);
+}
+
+
+/**************************************************************************/
+/* Optimization: If we see two emulated writes of zeros to the same
+ * page-table without another kind of page fault in between, we guess
+ * that this is a batch of changes (for process destruction) and
+ * unshadow the page so we don't take a pagefault on every entry. This
+ * should also make finding writeable mappings of pagetables much
+ * easier. */
+
+/* Look to see if this is the second emulated write in a row to this
+ * page, and unshadow/unhook if it is */
+static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
+{
+#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
+ if ( v->arch.shadow.last_emulated_mfn == mfn_x(gmfn) &&
+ sh_mfn_is_a_page_table(gmfn) )
+ {
+ u32 flags = mfn_to_page(gmfn)->shadow_flags;
+ mfn_t smfn;
+ if ( !(flags & (SHF_L2_32|SHF_L3_PAE|SHF_L4_64)) )
+ {
+ perfc_incrc(shadow_early_unshadow);
+ sh_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ );
+ return;
+ }
+ /* SHF_unhooked_mappings is set to make sure we only unhook
+ * once in a single batch of updates. It is reset when this
+ * top-level page is loaded into CR3 again */
+ if ( !(flags & SHF_unhooked_mappings) )
+ {
+ perfc_incrc(shadow_early_unshadow_top);
+ mfn_to_page(gmfn)->shadow_flags |= SHF_unhooked_mappings;
+ if ( flags & SHF_L2_32 )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH_l2_32_shadow);
+ shadow_unhook_mappings(v, smfn);
+ }
+ if ( flags & SHF_L3_PAE )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH_l3_pae_shadow);
+ shadow_unhook_mappings(v, smfn);
+ }
+ if ( flags & SHF_L4_64 )
+ {
+ smfn = get_shadow_status(v, gmfn, PGC_SH_l4_64_shadow);
+ shadow_unhook_mappings(v, smfn);
+ }
+ }
+ }
+ v->arch.shadow.last_emulated_mfn = mfn_x(gmfn);
+#endif
+}
+
+/* Stop counting towards early unshadows, as we've seen a real page fault */
+static inline void reset_early_unshadow(struct vcpu *v)
+{
+#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
+ v->arch.shadow.last_emulated_mfn = INVALID_MFN;
+#endif
+}
+
+
+
+/**************************************************************************/
+/* Entry points into the shadow code */
+
+/* Called from pagefault handler in Xen, and from the HVM trap handlers
+ * for pagefaults. Returns 1 if this fault was an artefact of the
+ * shadow code (and the guest should retry) or 0 if it is not (and the
+ * fault should be handled elsewhere or passed to the guest). */
+
+static int sh_page_fault(struct vcpu *v,
+ unsigned long va,
+ struct cpu_user_regs *regs)
+{
+ struct domain *d = v->domain;
+ walk_t gw;
+ u32 accumulated_gflags;
+ gfn_t gfn;
+ mfn_t gmfn, sl1mfn=_mfn(0);
+ shadow_l1e_t sl1e, *ptr_sl1e;
+ paddr_t gpa;
+ struct cpu_user_regs emul_regs;
+ struct x86_emulate_ctxt emul_ctxt;
+ int r, mmio;
+ fetch_type_t ft = 0;
+
+ //
+ // XXX: Need to think about eventually mapping superpages directly in the
+ // shadow (when possible), as opposed to splintering them into a
+ // bunch of 4K maps.
+ //
+
+ SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
+ v->domain->domain_id, v->vcpu_id, va, regs->error_code);
+
+ shadow_lock(d);
+
+ shadow_audit_tables(v);
+
+ if ( guest_walk_tables(v, va, &gw, 1) != 0 )
+ {
+ SHADOW_PRINTK("malformed guest pagetable!");
+ print_gw(&gw);
+ }
+
+ sh_audit_gw(v, &gw);
+
+ // We do not look at the gw->l1e, as that will not exist for superpages.
+ // Instead, we use the gw->eff_l1e...
+ //
+ // We need not check all the levels of the guest page table entries for
+ // present vs not-present, as the eff_l1e will always be not present if
+ // one of the higher level entries is not present.
+ //
+ if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
+ {
+ if ( hvm_guest(v) && !shadow_vcpu_mode_translate(v) )
+ {
+ /* Not present in p2m map, means this is mmio */
+ gpa = va;
+ goto mmio;
+ }
+
+ perfc_incrc(shadow_fault_bail_not_present);
+ goto not_a_shadow_fault;
+ }
+
+ // All levels of the guest page table are now known to be present.
+ accumulated_gflags = accumulate_guest_flags(&gw);
+
+ // Check for attempts to access supervisor-only pages from user mode,
+ // i.e. ring 3. Such errors are not caused or dealt with by the shadow
+ // code.
+ //
+ if ( (regs->error_code & PFEC_user_mode) &&
+ !(accumulated_gflags & _PAGE_USER) )
+ {
+ /* illegal user-mode access to supervisor-only page */
+ perfc_incrc(shadow_fault_bail_user_supervisor);
+ goto not_a_shadow_fault;
+ }
+
+ // Was it a write fault?
+ //
+ if ( regs->error_code & PFEC_write_access )
+ {
+ if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
+ {
+ perfc_incrc(shadow_fault_bail_ro_mapping);
+ goto not_a_shadow_fault;
+ }
+ }
+ else // must have been either an insn fetch or read fault
+ {
+ // Check for NX bit violations: attempts to execute code that is
+ // marked "do not execute". Such errors are not caused or dealt with
+ // by the shadow code.
+ //
+ if ( regs->error_code & PFEC_insn_fetch )
+ {
+ if ( accumulated_gflags & _PAGE_NX_BIT )
+ {
+ /* NX prevented this code fetch */
+ perfc_incrc(shadow_fault_bail_nx);
+ goto not_a_shadow_fault;
+ }
+ }
+ }
+
+ /* Is this an MMIO access? */
+ gfn = guest_l1e_get_gfn(gw.eff_l1e);
+ mmio = ( hvm_guest(v)
+ && shadow_vcpu_mode_translate(v)
+ && mmio_space(gfn_to_paddr(gfn)) );
+
+ /* For MMIO, the shadow holds the *gfn*; for normal accesses, it holds
+ * the equivalent mfn. */
+ if ( mmio )
+ gmfn = _mfn(gfn_x(gfn));
+ else
+ {
+ gmfn = vcpu_gfn_to_mfn(v, gfn);
+ if ( !valid_mfn(gmfn) )
+ {
+ perfc_incrc(shadow_fault_bail_bad_gfn);
+ SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n",
+ gfn_x(gfn), mfn_x(gmfn));
+ goto not_a_shadow_fault;
+ }
+ }
+
+ /* Make sure there is enough free shadow memory to build a chain of
+ * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
+ * to allocate all we need. (We never allocate a top-level shadow
+ * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
+ shadow_prealloc(d, SHADOW_MAX_ORDER);
+
+ /* Acquire the shadow. This must happen before we figure out the rights
+ * for the shadow entry, since we might promote a page here. */
+ // XXX -- this code will need to change somewhat if/when the shadow code
+ // can directly map superpages...
+ ft = ((regs->error_code & PFEC_write_access) ?
+ ft_demand_write : ft_demand_read);
+ ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
+ ASSERT(ptr_sl1e);
+
+ /* Calculate the shadow entry */
+ if ( ft == ft_demand_write )
+ {
+ if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) )
+ {
+ perfc_incrc(shadow_fault_emulate_write);
+ goto emulate;
+ }
+ }
+ else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) )
+ {
+ perfc_incrc(shadow_fault_emulate_read);
+ goto emulate;
+ }
+
+ /* Quick sanity check: we never make an MMIO entry that's got the
+ * _PAGE_PRESENT flag set in it. */
+ ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT));
+
+ r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
+
+ if ( mmio )
+ {
+ gpa = guest_walk_to_gpa(&gw);
+ goto mmio;
+ }
+
+#if 0
+ if ( !(r & SHADOW_SET_CHANGED) )
+ debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH_PRI_pte
+ ") did not change anything\n",
+ __func__, gw.va, l1e_get_intpte(sl1e));
+#endif
+
+ perfc_incrc(shadow_fault_fixed);
+ d->arch.shadow.fault_count++;
+ reset_early_unshadow(v);
+
+ done:
+ sh_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW_PRINTK("fixed\n");
+ shadow_audit_tables(v);
+ shadow_unlock(d);
+ return EXCRET_fault_fixed;
+
+ emulate:
+
+ /* Take the register set we were called with */
+ emul_regs = *regs;
+ if ( hvm_guest(v) )
+ {
+ /* Add the guest's segment selectors, rip, rsp, rflags */
+ hvm_store_cpu_guest_regs(v, &emul_regs, NULL);
+ }
+ emul_ctxt.regs = &emul_regs;
+ emul_ctxt.cr2 = va;
+ emul_ctxt.mode = hvm_guest(v) ? hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST;
+
+ SHADOW_PRINTK("emulate: eip=%#lx\n", emul_regs.eip);
+
+ v->arch.shadow.propagate_fault = 0;
+ if ( x86_emulate_memop(&emul_ctxt, &shadow_emulator_ops) )
+ {
+ SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
+ mfn_x(gmfn));
+ perfc_incrc(shadow_fault_emulate_failed);
+ /* If this is actually a page table, then we have a bug, and need
+ * to support more operations in the emulator. More likely,
+ * though, this is a hint that this page should not be shadowed. */
+ shadow_remove_all_shadows(v, gmfn);
+ /* This means that actual missing operations will cause the
+ * guest to loop on the same page fault. */
+ goto done;
+ }
+ if ( v->arch.shadow.propagate_fault )
+ {
+ /* Emulation triggered another page fault */
+ goto not_a_shadow_fault;
+ }
+
+ /* Emulator has changed the user registers: write back */
+ if ( hvm_guest(v) )
+ {
+ /* Write back the guest's segment selectors, rip, rsp, rflags */
+ hvm_load_cpu_guest_regs(v, &emul_regs);
+ /* And don't overwrite those in the caller's regs. */
+ emul_regs.eip = regs->eip;
+ emul_regs.cs = regs->cs;
+ emul_regs.eflags = regs->eflags;
+ emul_regs.esp = regs->esp;
+ emul_regs.ss = regs->ss;
+ emul_regs.es = regs->es;
+ emul_regs.ds = regs->ds;
+ emul_regs.fs = regs->fs;
+ emul_regs.gs = regs->gs;
+ }
+ *regs = emul_regs;
+
+ goto done;
+
+ mmio:
+ perfc_incrc(shadow_fault_mmio);
+ if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) )
+ {
+ /* Need to deal with these disabled-APIC accesses, as
+ * handle_mmio() apparently does not currently do that. */
+ /* TJD: What about it, then? For now, I'm turning this BUG()
+ * into a domain_crash() since we don't want to kill Xen. */
+ SHADOW_ERROR("disabled-APIC access: not supported\n.");
+ domain_crash(d);
+ }
+ sh_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW_PRINTK("mmio\n");
+ shadow_audit_tables(v);
+ reset_early_unshadow(v);
+ shadow_unlock(d);
+ sh_log_mmio(v, gpa);
+ handle_mmio(va, gpa);
+ return EXCRET_fault_fixed;
+
+ not_a_shadow_fault:
+ sh_audit_gw(v, &gw);
+ unmap_walk(v, &gw);
+ SHADOW_PRINTK("not a shadow fault\n");
+ shadow_audit_tables(v);
+ reset_early_unshadow(v);
+ shadow_unlock(d);
+ return 0;
+}
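
The handler above keys off the x86 page-fault error code to decide whether the fault is one the shadow code must fix. Below is a minimal standalone sketch of that classification, with the PFEC_* values written out from the architectural error-code layout; the names are re-declared here purely for illustration and are not taken from Xen's headers.

    /* Standalone sketch: classify a page fault by its error code, in the
     * same order the shadow fault handler above tests the bits. */
    #include <stdint.h>
    #include <stdio.h>

    #define PFEC_page_present (1U << 0)  /* set when the fault hit a present mapping */
    #define PFEC_write_access (1U << 1)
    #define PFEC_user_mode    (1U << 2)
    #define PFEC_insn_fetch   (1U << 4)

    static const char *classify_fault(uint32_t error_code)
    {
        if ( !(error_code & PFEC_page_present) )
            return "not-present: maybe MMIO, else not a shadow fault";
        if ( error_code & PFEC_write_access )
            return "write fault: check _PAGE_RW in the accumulated guest flags";
        if ( error_code & PFEC_insn_fetch )
            return "instruction fetch: check the guest NX bit";
        return "read fault";
    }

    int main(void)
    {
        printf("%s\n", classify_fault(PFEC_page_present | PFEC_write_access));
        return 0;
    }
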
+
+
+static int
+sh_invlpg(struct vcpu *v, unsigned long va)
+/* Called when the guest requests an invlpg. Returns 1 if the invlpg
+ * instruction should be issued on the hardware, or 0 if it's safe not
+ * to do so. */
+{
+ shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va);
+
+ // XXX -- might be a good thing to prefetch the va into the shadow
+
+ // no need to flush anything if there's no SL2...
+ //
+ if ( !ptr_sl2e )
+ return 0;
+
+ // If there's nothing shadowed for this particular sl2e, then
+ // there is no need to do an invlpg, either...
+ //
+ if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) )
+ return 0;
+
+ // Check to see if the SL2 is a splintered superpage...
+ // If so, then we'll need to flush the entire TLB (because that's
+ // easier than invalidating all of the individual 4K pages).
+ //
+ if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info &
+ PGC_SH_type_mask) == PGC_SH_fl1_shadow )
+ {
+ local_flush_tlb();
+ return 0;
+ }
+
+ return 1;
+}
+
+static unsigned long
+sh_gva_to_gfn(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to what the *guest*
+ * pagetables would map it to. */
+{
+ walk_t gw;
+ gfn_t gfn;
+
+ guest_walk_tables(v, va, &gw, 0);
+ gfn = guest_walk_to_gfn(&gw);
+ unmap_walk(v, &gw);
+
+ return gfn_x(gfn);
+}
+
+
+static unsigned long
+sh_gva_to_gpa(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to the guest physical
+ * address that the *guest* pagetables would map it to. */
+{
+ unsigned long gfn = sh_gva_to_gfn(v, va);
+ if ( gfn == INVALID_GFN )
+ return 0;
+ else
+ return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
+}
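
The gva-to-gpa arithmetic above is just the translated frame number shifted into place plus the page offset of the original virtual address. A self-contained sketch of the same calculation, assuming 4K pages (PAGE_SHIFT and PAGE_MASK are re-defined locally for illustration):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long va  = 0x00402abcUL;   /* example guest virtual address */
        unsigned long gfn = 0x1234UL;       /* frame the guest tables map it to */
        unsigned long gpa = (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK);

        /* 0x1234 << 12 = 0x1234000, page offset 0xabc -> gpa 0x1234abc */
        printf("gpa = %#lx\n", gpa);
        return 0;
    }
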
+
+
+// XXX -- should this be in this file?
+// Or should it be moved to shadow-common.c?
+//
+/* Returns a lowmem machine address of the copied HVM L3 root table.
+ * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy,
+ * otherwise blank out any entries with reserved bits in them. */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long
+hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res)
+{
+ int i, f;
+ int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY);
+ l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+ memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t));
+ for ( i = 0; i < 4; i++ )
+ {
+ f = l3e_get_flags(l3tab[i]);
+ if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) )
+ new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res);
+ else
+ new_l3e = l3e_empty();
+ safe_write_entry(&copy[i], &new_l3e);
+ }
+ return __pa(copy);
+}
+#endif
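
A standalone sketch of the flag handling in hvm_pae_copy_root: a present entry is copied with the reserved bits stripped when clear_res is set, and is blanked out if any of them are set otherwise. The entry layout and flag values below are simplified for illustration (the NX bit is omitted to keep the constants 32-bit friendly); this is not the real l3_pgentry_t interface.

    #include <stdint.h>

    #define _PAGE_PRESENT  0x001U
    #define _PAGE_RW       0x002U
    #define _PAGE_USER     0x004U
    #define _PAGE_ACCESSED 0x020U
    #define _PAGE_DIRTY    0x040U

    /* Bits that are reserved in a PAE L3 entry (NX left out of the sketch). */
    #define PAE_L3_RESERVED (_PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)

    /* Copy four L3 entries, either clearing the reserved bits (clear_res != 0)
     * or dropping any entry that has one of them set. */
    void copy_pae_l3(uint64_t dst[4], const uint64_t src[4], int clear_res)
    {
        for ( int i = 0; i < 4; i++ )
        {
            uint64_t e = src[i];
            if ( (e & _PAGE_PRESENT) &&
                 (!(e & PAE_L3_RESERVED) || clear_res) )
                dst[i] = e & ~(uint64_t)PAE_L3_RESERVED;
            else
                dst[i] = 0;
        }
    }
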
+
+
+static inline void
+sh_update_linear_entries(struct vcpu *v)
+/* Sync up all the linear mappings for this vcpu's pagetables */
+{
+ struct domain *d = v->domain;
+
+ /* Linear pagetables in PV guests
+ * ------------------------------
+ *
+ * Guest linear pagetables, which map the guest pages, are at
+ * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the
+ * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these
+ * are set up at shadow creation time, but (of course!) the PAE case
+ * is subtler. Normal linear mappings are made by having an entry
+ * in the top-level table that points to itself (shadow linear) or
+ * to the guest top-level table (guest linear). For PAE, to set up
+ * a linear map requires us to copy the four top-level entries into
+ * level-2 entries. That means that every time we change a PAE l3e,
+ * we need to reflect the change into the copy.
+ *
+ * Linear pagetables in HVM guests
+ * -------------------------------
+ *
+ * For HVM guests, the linear pagetables are installed in the monitor
+ * tables (since we can't put them in the shadow). Shadow linear
+ * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
+ * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for
+ * a linear pagetable of the monitor tables themselves. We have
+ * the same issue of having to re-copy PAE l3 entries whenever we use
+ * PAE shadows.
+ *
+ * Because HVM guests run on the same monitor tables regardless of the
+ * shadow tables in use, the linear mapping of the shadow tables has to
+ * be updated every time v->arch.shadow_table changes.
+ */
+
+ /* Don't try to update the monitor table if it doesn't exist */
+ if ( shadow_mode_external(d)
+ && pagetable_get_pfn(v->arch.monitor_table) == 0 )
+ return;
+
+#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
+
+ /* For PV, one l4e points at the guest l4, one points at the shadow
+ * l4. No maintenance required.
+ * For HVM, just need to update the l4e that points to the shadow l4. */
+
+ if ( shadow_mode_external(d) )
+ {
+ /* Use the linear map if we can; otherwise make a new mapping */
+ if ( v == current )
+ {
+ __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] =
+ l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ }
+ else
+ {
+ l4_pgentry_t *ml4e;
+ ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ sh_unmap_domain_page(ml4e);
+ }
+ }
+
+#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
+
+ /* This case only exists in HVM. To give ourselves a linear map of the
+ * shadows, we need to extend a PAE shadow to 4 levels. We do this by
+ * having a monitor l3 in slot 0 of the monitor l4 table, and
+ * copying the PAE l3 entries into it. Then, by having the monitor l4e
+ * for shadow pagetables also point to the monitor l4, we can use it
+ * to access the shadows. */
+
+ if ( shadow_mode_external(d) )
+ {
+ /* Install copies of the shadow l3es into the monitor l3 table.
+ * The monitor l3 table is hooked into slot 0 of the monitor
+ * l4 table, so we use l3 linear indices 0 to 3 */
+ shadow_l3e_t *sl3e;
+ l3_pgentry_t *ml3e;
+ mfn_t l3mfn;
+ int i;
+
+ /* Use linear mappings if we can; otherwise make new mappings */
+ if ( v == current )
+ {
+ ml3e = __linear_l3_table;
+ l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables are made up by update_cr3 */
+ sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+#else
+ sl3e = v->arch.shadow_vtable;
+#endif
+ }
+ else
+ {
+ l4_pgentry_t *ml4e;
+ ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
+ l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
+ ml3e = sh_map_domain_page(l3mfn);
+ sh_unmap_domain_page(ml4e);
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables are made up by update_cr3 */
+ sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+#else
+ sl3e = sh_map_domain_page(pagetable_get_mfn(v->arch.shadow_table));
+#endif
+ }
+
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ ml3e[i] =
+ (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT)
+ ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])),
+ __PAGE_HYPERVISOR)
+ : l3e_empty();
+ }
+
+ if ( v != current )
+ {
+ sh_unmap_domain_page(ml3e);
+#if GUEST_PAGING_LEVELS != 2
+ sh_unmap_domain_page(sl3e);
+#endif
+ }
+ }
+
+#elif CONFIG_PAGING_LEVELS == 3
+
+ /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
+ * entries in the shadow, and the shadow's l3 entries into the
+ * shadow-linear-map l2 entries in the shadow. This is safe to do
+ * because Xen does not let guests share high-slot l2 tables between l3s,
+ * so we know we're not treading on anyone's toes.
+ *
+ * HVM: need to copy the shadow's l3 entries into the
+ * shadow-linear-map l2 entries in the monitor table. This is safe
+ * because we have one monitor table for each vcpu. The monitor's
+ * own l3es don't need to be copied because they never change.
+ * XXX That might change if we start stuffing things into the rest
+ * of the monitor's virtual address space.
+ */
+ {
+ l2_pgentry_t *l2e, new_l2e;
+ shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
+ int i;
+
+#if GUEST_PAGING_LEVELS == 2
+ /* Shadow l3 tables were built by update_cr3 */
+ if ( shadow_mode_external(d) )
+ shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+ else
+ BUG(); /* PV 2-on-3 is not supported yet */
+
+#else /* GUEST_PAGING_LEVELS == 3 */
+
+ /* Use local vcpu's mappings if we can; otherwise make new mappings */
+ if ( v == current )
+ {
+ shadow_l3e = v->arch.shadow_vtable;
+ if ( !shadow_mode_external(d) )
+ guest_l3e = v->arch.guest_vtable;
+ }
+ else
+ {
+ mfn_t smfn;
+ int idx;
+
+ /* Map the shadow l3 */
+ smfn = pagetable_get_mfn(v->arch.shadow_table);
+ idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable));
+ shadow_l3e = sh_map_domain_page(smfn);
+ shadow_l3e += idx;
+ if ( !shadow_mode_external(d) )
+ {
+ /* Also the guest l3 */
+ mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table);
+ guest_l3e = sh_map_domain_page(gmfn);
+ guest_l3e += guest_index(v->arch.guest_vtable);
+ }
+ }
+#endif /* GUEST_PAGING_LEVELS */
+
+ /* Choose where to write the entries, using linear maps if possible */
+ if ( v == current && shadow_mode_external(d) )
+ {
+ /* From the monitor tables, it's safe to use linear maps to update
+ * monitor l2s */
+ l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
+ }
+ else if ( shadow_mode_external(d) )
+ {
+ /* Map the monitor table's high l2 */
+ l3_pgentry_t *l3e;
+ l3e = sh_map_domain_page(
+ pagetable_get_mfn(v->arch.monitor_table));
+ ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
+ l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
+ sh_unmap_domain_page(l3e);
+ }
+ else
+ {
+ /* Map the shadow table's high l2 */
+ ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
+ l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
+ }
+
+
+ if ( !shadow_mode_external(d) )
+ {
+ /* Write linear mapping of guest. */
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
+ __PAGE_HYPERVISOR)
+ : l2e_empty();
+ safe_write_entry(
+ &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
+ &new_l2e);
+ }
+ }
+
+ /* Write linear mapping of shadow. */
+ for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+ {
+ new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT)
+ ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
+ __PAGE_HYPERVISOR)
+ : l2e_empty();
+ safe_write_entry(
+ &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
+ &new_l2e);
+ }
+
+ if ( v != current || !shadow_mode_external(d) )
+ sh_unmap_domain_page(l2e);
+
+#if GUEST_PAGING_LEVELS == 3
+ if ( v != current)
+ {
+ sh_unmap_domain_page(shadow_l3e);
+ if ( !shadow_mode_external(d) )
+ sh_unmap_domain_page(guest_l3e);
+ }
+#endif
+ }
+
+#elif CONFIG_PAGING_LEVELS == 2
+
+ /* For PV, one l2e points at the guest l2, one points at the shadow
+ * l2. No maintenance required.
+ * For HVM, just need to update the l2e that points to the shadow l2. */
+
+ if ( shadow_mode_external(d) )
+ {
+ /* Use the linear map if we can; otherwise make a new mapping */
+ if ( v == current )
+ {
+ __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] =
+ l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ }
+ else
+ {
+ l2_pgentry_t *ml2e;
+ ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+ ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+ l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+ __PAGE_HYPERVISOR);
+ sh_unmap_domain_page(ml2e);
+ }
+ }
+
+#else
+#error this should not happen
+#endif
+}
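
Every PAE branch above boils down to the same refresh: walk the four top-level entries and mirror each present one into the corresponding linear-map slot, clearing the slot otherwise. A deliberately simplified standalone sketch of that loop (entries reduced to plain 64-bit words, and the rewrite of flags to __PAGE_HYPERVISOR omitted):

    #include <stdint.h>

    #define _PAGE_PRESENT 0x1U

    /* Refresh a 4-entry linear-map window from a 4-entry top-level table:
     * present source entries are mirrored, everything else is cleared. */
    void sync_linear_window(uint64_t dst[4], const uint64_t src[4])
    {
        for ( int i = 0; i < 4; i++ )
            dst[i] = (src[i] & _PAGE_PRESENT) ? src[i] : 0;
    }
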
+
+
+// XXX -- should this be in this file?
+// Or should it be moved to shadow-common.c?
+//
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+void sh_pae_recopy(struct domain *d)
+/* Called whenever we write to the l3 entries of a PAE pagetable which
+ * is currently in use. Each vcpu that is using the table needs to
+ * resync its copies of the l3s in linear maps and any low-memory
+ * copies it might have made for fitting into 32bit CR3.
+ * Since linear maps are also resynced when we change CR3, we don't
+ * need to worry about changes to PAE l3es that are not currently in use.*/
+{
+ struct vcpu *v;
+ cpumask_t flush_mask = CPU_MASK_NONE;
+ ASSERT(shadow_lock_is_acquired(d));
+
+ for_each_vcpu(d, v)
+ {
+ if ( !v->arch.shadow.pae_flip_pending )
+ continue;
+
+ cpu_set(v->processor, flush_mask);
+
+ SHADOW_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id);
+
+ /* This vcpu has a copy in its linear maps */
+ sh_update_linear_entries(v);
+ if ( hvm_guest(v) )
+ {
+ /* This vcpu has a copy in its HVM PAE l3 */
+ v->arch.hvm_vcpu.hw_cr3 =
+ hvm_pae_copy_root(v, v->arch.shadow_vtable,
+ !shadow_vcpu_mode_translate(v));
+ }
+#if CONFIG_PAGING_LEVELS == 3
+ else
+ {
+ /* This vcpu might have copied the l3 to below 4GB */
+ if ( v->arch.cr3 >> PAGE_SHIFT
+ != pagetable_get_pfn(v->arch.shadow_table) )
+ {
+ /* Recopy to where that copy is. */
+ int i;
+ l3_pgentry_t *dst, *src;
+ dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */
+ src = v->arch.shadow_vtable;
+ for ( i = 0 ; i < 4 ; i++ )
+ safe_write_entry(dst + i, src + i);
+ }
+ }
+#endif
+ v->arch.shadow.pae_flip_pending = 0;
+ }
+
+ flush_tlb_mask(flush_mask);
+}
+#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */
+
+
+/* removes:
+ * vcpu->arch.guest_vtable
+ * vcpu->arch.shadow_table
+ * vcpu->arch.shadow_vtable
+ * Does all appropriate management/bookkeeping/refcounting/etc...
+ */
+static void
+sh_detach_old_tables(struct vcpu *v)
+{
+ mfn_t smfn;
+
+ ////
+ //// vcpu->arch.guest_vtable
+ ////
+ if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
+ v->arch.guest_vtable )
+ {
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ sh_unmap_domain_page_global(v->arch.guest_vtable);
+ v->arch.guest_vtable = NULL;
+ }
+
+ ////
+ //// vcpu->arch.shadow_table
+ ////
+ smfn = pagetable_get_mfn(v->arch.shadow_table);
+ if ( mfn_x(smfn) )
+ {
+ ASSERT(v->arch.shadow_vtable);
+
+#if GUEST_PAGING_LEVELS == 3
+ // PAE guests do not (necessarily) use an entire page for their
+ // 4-entry L3s, so we have to deal with them specially.
+ //
+ sh_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn);
+#else
+ sh_put_ref(v, smfn, 0);
+#endif
+
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+ {
+ struct pae_l3_bookkeeping *info =
+ sl3p_to_info(v->arch.shadow_vtable);
+ ASSERT(test_bit(v->vcpu_id, &info->vcpus));
+ clear_bit(v->vcpu_id, &info->vcpus);
+ }
+#endif
+ v->arch.shadow_table = pagetable_null();
+ }
+
+ ////
+ //// vcpu->arch.shadow_vtable
+ ////
+ if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
+ v->arch.shadow_vtable )
+ {
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ //
+ sh_unmap_domain_page_global(v->arch.shadow_vtable);
+ v->arch.shadow_vtable = NULL;
+ }
+}
+
+static void
+sh_update_cr3(struct vcpu *v)
+/* Updates vcpu->arch.shadow_table after the guest has changed CR3.
+ * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
+ * if appropriate).
+ * HVM guests should also set hvm_get_guest_ctrl_reg(v, 3)...
+ */
+{
+ struct domain *d = v->domain;
+ mfn_t gmfn, smfn;
+#if GUEST_PAGING_LEVELS == 3
+ u32 guest_idx=0;
+#endif
+
+ ASSERT(shadow_lock_is_acquired(v->domain));
+ ASSERT(v->arch.shadow.mode);
+
+ ////
+ //// vcpu->arch.guest_table is already set
+ ////
+
+#ifndef NDEBUG
+ /* Double-check that the HVM code has sent us a sane guest_table */
+ if ( hvm_guest(v) )
+ {
+ gfn_t gfn;
+
+ ASSERT(shadow_mode_external(d));
+
+ // Is paging enabled on this vcpu?
+ if ( shadow_vcpu_mode_translate(v) )
+ {
+ gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
+ gmfn = vcpu_gfn_to_mfn(v, gfn);
+ ASSERT(valid_mfn(gmfn));
+ ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
+ }
+ else
+ {
+ /* Paging disabled: guest_table points at (part of) p2m */
+#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
+ /* For everything else, they should be the same */
+ ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
+#endif
+ }
+ }
+#endif
+
+ SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
+ d->domain_id, v->vcpu_id,
+ (unsigned long)pagetable_get_pfn(v->arch.guest_table));
+
+#if GUEST_PAGING_LEVELS == 4
+ if ( !(v->arch.flags & TF_kernel_mode) )
+ gmfn = pagetable_get_mfn(v->arch.guest_table_user);
+ else
+#endif
+ gmfn = pagetable_get_mfn(v->arch.guest_table);
+
+ sh_detach_old_tables(v);
+
+ if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+ {
+ ASSERT(v->arch.cr3 == 0);
+ return;
+ }
+
+ ////
+ //// vcpu->arch.guest_vtable
+ ////
+ if ( shadow_mode_external(d) )
+ {
+#if GUEST_PAGING_LEVELS == 3
+ if ( shadow_vcpu_mode_translate(v) )
+ /* Paging enabled: find where in the page the l3 table is */
+ guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
+ else
+ /* Paging disabled: l3 is at the start of a page (in the p2m) */
+ guest_idx = 0;
+
+ // Ignore the low 2 bits of guest_idx -- they are really just
+ // cache control.
+ guest_idx &= ~3;
+ // XXX - why does this need a global map?
+ v->arch.guest_vtable =
+ (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx;
+#else
+ // XXX - why does this need a global map?
+ v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
+#endif
+ }
+ else
+ {
+#ifdef __x86_64__
+ v->arch.guest_vtable = __linear_l4_table;
+#elif GUEST_PAGING_LEVELS == 3
+ // XXX - why does this need a global map?
+ v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
+#else
+ v->arch.guest_vtable = __linear_l2_table;
+#endif
+ }
+
+#if 0
+ printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
+ __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
+#endif
+
+ ////
+ //// vcpu->arch.shadow_table
+ ////
+ smfn = get_shadow_status(v, gmfn, PGC_SH_guest_root_type);
+ if ( valid_mfn(smfn) )
+ {
+ /* Pull this root shadow to the front of the list of roots. */
+ list_del(&mfn_to_page(smfn)->list);
+ list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows);
+ }
+ else
+ {
+ /* This guest MFN is a pagetable. Must revoke write access. */
+ if ( shadow_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0)
+ != 0 )
+ flush_tlb_mask(d->domain_dirty_cpumask);
+ /* Make sure there's enough free shadow memory. */
+ shadow_prealloc(d, SHADOW_MAX_ORDER);
+ /* Shadow the page. */
+ smfn = sh_make_shadow(v, gmfn, PGC_SH_guest_root_type);
+ list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows);
+ }
+ ASSERT(valid_mfn(smfn));
+ v->arch.shadow_table = pagetable_from_mfn(smfn);
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
+ /* Once again OK to unhook entries from this table if we see fork/exit */
+ ASSERT(sh_mfn_is_a_page_table(gmfn));
+ mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
+#endif
+
+
+ ////
+ //// vcpu->arch.shadow_vtable
+ ////
+ if ( shadow_mode_external(d) )
+ {
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+ mfn_t adjusted_smfn = smfn;
+ u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx);
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ v->arch.shadow_vtable =
+ (shadow_l3e_t *)sh_map_domain_page_global(adjusted_smfn) +
+ shadow_idx;
+#else
+ // Q: why does this need to use (un)map_domain_page_*global* ?
+ v->arch.shadow_vtable = sh_map_domain_page_global(smfn);
+#endif
+ }
+ else
+ {
+#if SHADOW_PAGING_LEVELS == 4
+ v->arch.shadow_vtable = __sh_linear_l4_table;
+#elif GUEST_PAGING_LEVELS == 3
+ // XXX - why does this need a global map?
+ v->arch.shadow_vtable = sh_map_domain_page_global(smfn);
+#else
+ v->arch.shadow_vtable = __sh_linear_l2_table;
+#endif
+ }
+
+ ////
+ //// Take a ref to the new shadow table, and pin it.
+ ////
+ //
+ // This ref is logically "held" by v->arch.shadow_table entry itself.
+ // Release the old ref.
+ //
+#if GUEST_PAGING_LEVELS == 3
+ // PAE guests do not (necessarily) use an entire page for their
+ // 4-entry L3s, so we have to deal with them specially.
+ //
+ // XXX - might want to revisit this if/when we do multiple compilation for
+ // HVM-vs-PV guests, as PAE PV guests could get away without doing
+ // subshadows.
+ //
+ sh_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn);
+ sh_pin_l3_subshadow(v->arch.shadow_vtable, smfn);
+#else
+ sh_get_ref(smfn, 0);
+ sh_pin(smfn);
+#endif
+
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+ // PAE 3-on-3 shadows have to keep track of which vcpus are using
+ // which l3 subshadow, in order to handle the SHADOW_SET_L3PAE_RECOPY
+ // case from validate_gl3e(). Search for SHADOW_SET_L3PAE_RECOPY
+ // in the code for more info.
+ //
+ {
+ struct pae_l3_bookkeeping *info =
+ sl3p_to_info(v->arch.shadow_vtable);
+ ASSERT(!test_bit(v->vcpu_id, &info->vcpus));
+ set_bit(v->vcpu_id, &info->vcpus);
+ }
+#endif
+
+ debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n",
+ __func__, gmfn, smfn);
+
+ ///
+ /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3
+ ///
+ if ( shadow_mode_external(d) )
+ {
+ ASSERT(hvm_guest(v));
+ make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
+
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+#if SHADOW_PAGING_LEVELS != 3
+#error unexpected combination of GUEST and SHADOW paging levels
+#endif
+ /* 2-on-3: make a PAE l3 table that points at the four-page l2 */
+ {
+ mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table);
+ int i;
+
+ ASSERT(v->arch.hvm_vcpu.hw_cr3 ==
+ virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab));
+ for (i = 0; i < 4; i++)
+ {
+ v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] =
+ shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT);
+ }
+ }
+#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+ /* 3-on-3: copy the shadow l3 to slots that are below 4GB.
+ * If paging is disabled, clear l3e reserved bits; otherwise
+ * remove entries that have reserved bits set. */
+ v->arch.hvm_vcpu.hw_cr3 =
+ hvm_pae_copy_root(v, v->arch.shadow_vtable,
+ !shadow_vcpu_mode_translate(v));
+#else
+ /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */
+ v->arch.hvm_vcpu.hw_cr3 =
+ pagetable_get_paddr(v->arch.shadow_table);
+#endif
+ }
+ else // not shadow_mode_external...
+ {
+ /* We don't support PV except guest == shadow == config levels */
+ BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
+ make_cr3(v, pagetable_get_pfn(v->arch.shadow_table));
+ }
+
+ /* Fix up the linear pagetable mappings */
+ sh_update_linear_entries(v);
+}
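
For the 2-on-3 case in sh_update_cr3, the four PAE l3 slots just point at four consecutive frames of the multi-page shadow l2. A minimal sketch of that construction, with the entry format reduced to a frame number shifted by PAGE_SHIFT plus a present bit (purely illustrative, not the real shadow_l3e layout):

    #include <stdint.h>

    #define PAGE_SHIFT    12
    #define _PAGE_PRESENT 0x1U

    /* Build a 4-entry PAE l3 whose slots map four consecutive frames,
     * starting at l2_base_mfn. */
    void build_lowmem_l3(uint64_t l3tab[4], uint64_t l2_base_mfn)
    {
        for ( int i = 0; i < 4; i++ )
            l3tab[i] = ((l2_base_mfn + i) << PAGE_SHIFT) | _PAGE_PRESENT;
    }
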
+
+
+/**************************************************************************/
+/* Functions to revoke guest rights */
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
+static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
+/* Look up this vaddr in the current shadow and see if it's a writeable
+ * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */
+{
+ shadow_l1e_t sl1e, *sl1p;
+ shadow_l2e_t *sl2p;
+#if GUEST_PAGING_LEVELS >= 3
+ shadow_l3e_t *sl3p;
+#if GUEST_PAGING_LEVELS >= 4
+ shadow_l4e_t *sl4p;
+#endif
+#endif
+ mfn_t sl1mfn;
+
+
+ /* Carefully look in the shadow linear map for the l1e we expect */
+ if ( v->arch.shadow_vtable == NULL ) return 0;
+#if GUEST_PAGING_LEVELS >= 4
+ sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
+ if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
+ return 0;
+ sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
+ if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
+ return 0;
+#elif GUEST_PAGING_LEVELS == 3
+ sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable)