# HG changeset patch
# User awilliam@xxxxxxxxxxx
# Node ID f34e37d0742d80ccfefd017a91f93310ebc2dfe8
# Parent 9da2d9b48ff8711516a07f7a06120abedb4e24b2
# Parent bd811e94d293ebcb8fb15db0becacd36c65a4ac7
merge with xen-unstable.hg
---
xen/arch/x86/hvm/svm/instrlen.c | 479 ------------------------------
docs/man/xm.pod.1 | 4
docs/src/user.tex | 2
tools/firmware/vmxassist/vm86.c | 37 --
tools/ioemu/hw/serial.c | 68 ++++
tools/ioemu/usb-linux.c | 3
tools/ioemu/vl.c | 2
tools/ioemu/vnc.c | 33 +-
tools/python/xen/xend/XendCheckpoint.py | 4
tools/python/xen/xend/XendDomain.py | 17 -
tools/python/xen/xend/server/SrvDomain.py | 2
tools/python/xen/xm/main.py | 8
xen/arch/ia64/xen/domain.c | 6
xen/arch/ia64/xen/xensetup.c | 8
xen/arch/x86/Rules.mk | 2
xen/arch/x86/boot/x86_32.S | 26 -
xen/arch/x86/hvm/Makefile | 1
xen/arch/x86/hvm/hvm.c | 70 +++-
xen/arch/x86/hvm/i8259.c | 8
xen/arch/x86/hvm/instrlen.c | 474 +++++++++++++++++++++++++++++
xen/arch/x86/hvm/platform.c | 48 +--
xen/arch/x86/hvm/svm/Makefile | 1
xen/arch/x86/hvm/svm/svm.c | 27 -
xen/arch/x86/hvm/vmx/vmcs.c | 135 ++++++--
xen/arch/x86/hvm/vmx/vmx.c | 87 ++---
xen/arch/x86/irq.c | 12
xen/arch/x86/setup.c | 7
xen/common/domain.c | 6
xen/common/gdbstub.c | 3
xen/common/schedule.c | 26 +
xen/include/asm-x86/hvm/hvm.h | 18 -
xen/include/asm-x86/hvm/vmx/vmcs.h | 15
xen/include/asm-x86/hvm/vmx/vmx.h | 93 -----
xen/include/xen/compiler.h | 2
xen/include/xen/sched.h | 2
35 files changed, 919 insertions(+), 817 deletions(-)
diff -r 9da2d9b48ff8 -r f34e37d0742d docs/man/xm.pod.1
--- a/docs/man/xm.pod.1 Tue Sep 26 16:15:45 2006 -0600
+++ b/docs/man/xm.pod.1 Tue Sep 26 19:11:33 2006 -0600
@@ -393,7 +393,9 @@ specified, VCPU information for all doma
=item B<vcpu-pin> I<domain-id> I<vcpu> I<cpus>
-Pins the the VCPU to only run on the specific CPUs.
+Pins the VCPU to run only on the specified CPUs. The keyword
+I<all> can be used to apply the I<cpus> list to all VCPUs in the
+domain.
Normally VCPUs can float between available CPUs whenever Xen deems a
different run state is appropriate. Pinning can be used to restrict
diff -r 9da2d9b48ff8 -r f34e37d0742d docs/src/user.tex
--- a/docs/src/user.tex Tue Sep 26 16:15:45 2006 -0600
+++ b/docs/src/user.tex Tue Sep 26 19:11:33 2006 -0600
@@ -3208,6 +3208,8 @@ editing \path{grub.conf}.
respectively; if no suffix is specified, the parameter defaults to
kilobytes. In previous versions of Xen, suffixes were not supported
and the value is always interpreted as kilobytes.
+\item [ dom0\_vcpus\_pin ] Pins domain 0 VCPUs on their respective
+ physical CPUs (default=false).
\item [ tbuf\_size=xxx ] Set the size of the per-cpu trace buffers, in
pages (default 0).
\item [ sched=xxx ] Select the CPU scheduler Xen should use. The
diff -r 9da2d9b48ff8 -r f34e37d0742d tools/firmware/vmxassist/vm86.c
--- a/tools/firmware/vmxassist/vm86.c Tue Sep 26 16:15:45 2006 -0600
+++ b/tools/firmware/vmxassist/vm86.c Tue Sep 26 19:11:33 2006 -0600
@@ -69,28 +69,23 @@ guest_linear_to_real(uint32_t base)
if (!(oldctx.cr4 & CR4_PAE)) {
l1_mfn = ((uint32_t *)gcr3)[(base >> 22) & 0x3ff];
-
- if (oldctx.cr4 & CR4_PSE || l1_mfn & PDE_PS) {
- /* 1 level page table */
- l0_mfn = l1_mfn;
- if (!(l0_mfn & PT_ENTRY_PRESENT))
- panic("l1 entry not present\n");
-
- l0_mfn &= 0xffc00000;
+ if (!(l1_mfn & PT_ENTRY_PRESENT))
+ panic("l2 entry not present\n");
+
+ if ((oldctx.cr4 & CR4_PSE) && (l1_mfn & PDE_PS)) {
+ l0_mfn = l1_mfn & 0xffc00000;
return l0_mfn + (base & 0x3fffff);
}
- if (!(l1_mfn & PT_ENTRY_PRESENT))
- panic("l2 entry not present\n");
-
l1_mfn &= 0xfffff000;
+
l0_mfn = ((uint32_t *)l1_mfn)[(base >> 12) & 0x3ff];
if (!(l0_mfn & PT_ENTRY_PRESENT))
panic("l1 entry not present\n");
l0_mfn &= 0xfffff000;
return l0_mfn + (base & 0xfff);
- } else if (oldctx.cr4 & CR4_PAE && !(oldctx.cr4 & CR4_PSE)) {
+ } else {
l2_mfn = ((uint64_t *)gcr3)[(base >> 30) & 0x3];
if (!(l2_mfn & PT_ENTRY_PRESENT))
panic("l3 entry not present\n");
@@ -99,6 +94,12 @@ guest_linear_to_real(uint32_t base)
l1_mfn = ((uint64_t *)l2_mfn)[(base >> 21) & 0x1ff];
if (!(l1_mfn & PT_ENTRY_PRESENT))
panic("l2 entry not present\n");
+
+ if (l1_mfn & PDE_PS) { /* CR4.PSE is ignored in PAE mode */
+ l0_mfn = l1_mfn & 0x3ffe00000ULL;
+ return l0_mfn + (base & 0x1fffff);
+ }
+
l1_mfn &= 0x3fffff000ULL;
l0_mfn = ((uint64_t *)l1_mfn)[(base >> 12) & 0x1ff];
@@ -107,18 +108,6 @@ guest_linear_to_real(uint32_t base)
l0_mfn &= 0x3fffff000ULL;
return l0_mfn + (base & 0xfff);
- } else { /* oldctx.cr4 & CR4_PAE && oldctx.cr4 & CR4_PSE */
- l1_mfn = ((uint64_t *)gcr3)[(base >> 30) & 0x3];
- if (!(l1_mfn & PT_ENTRY_PRESENT))
- panic("l2 entry not present\n");
- l1_mfn &= 0x3fffff000ULL;
-
- l0_mfn = ((uint64_t *)l1_mfn)[(base >> 21) & 0x1ff];
- if (!(l0_mfn & PT_ENTRY_PRESENT))
- panic("l1 entry not present\n");
- l0_mfn &= 0x3ffe00000ULL;
-
- return l0_mfn + (base & 0x1fffff);
}
}
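
The rewritten walk above checks the present bit before testing PDE_PS and adds
the missing 2MB superpage case for PAE (where CR4.PSE is ignored). For
reference, a minimal sketch of the non-PAE address split that the masks above
implement (illustrative only, not part of the patch):

    #include <stdint.h>

    /* Non-PAE 32-bit linear address split, mirroring guest_linear_to_real():
     *   bits 31:22 index the page directory,
     *   bits 21:12 index the page table (4KB mappings),
     *   bits 21:0  are the offset within a 4MB PSE superpage. */
    static inline uint32_t pde_index(uint32_t base)   { return (base >> 22) & 0x3ff; }
    static inline uint32_t pte_index(uint32_t base)   { return (base >> 12) & 0x3ff; }
    static inline uint32_t pse_offset(uint32_t base)  { return base & 0x3fffff; }
    static inline uint32_t page_offset(uint32_t base) { return base & 0xfff; }
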
diff -r 9da2d9b48ff8 -r f34e37d0742d tools/ioemu/hw/serial.c
--- a/tools/ioemu/hw/serial.c Tue Sep 26 16:15:45 2006 -0600
+++ b/tools/ioemu/hw/serial.c Tue Sep 26 19:11:33 2006 -0600
@@ -22,6 +22,9 @@
* THE SOFTWARE.
*/
#include "vl.h"
+#include <sys/time.h>
+#include <time.h>
+#include <assert.h>
//#define DEBUG_SERIAL
@@ -138,6 +141,67 @@ static void serial_update_parameters(Ser
printf("speed=%d parity=%c data=%d stop=%d\n",
speed, parity, data_bits, stop_bits);
#endif
+}
+
+/* Rate limit serial requests so that e.g. grub on a serial console
+ doesn't kill dom0. Simple token bucket. If we get some actual
+ data from the user, instantly refill the bucket. */
+
+/* How long it takes to generate a token, in microseconds. */
+#define TOKEN_PERIOD 1000
+/* Maximum and initial size of token bucket */
+#define TOKENS_MAX 100000
+
+static int tokens_avail;
+
+static void serial_get_token(void)
+{
+ static struct timeval last_refil_time;
+ static int started;
+
+ assert(tokens_avail >= 0);
+ if (!tokens_avail) {
+ struct timeval delta, now;
+ int generated;
+
+ if (!started) {
+ gettimeofday(&last_refil_time, NULL);
+ tokens_avail = TOKENS_MAX;
+ started = 1;
+ return;
+ }
+ retry:
+ gettimeofday(&now, NULL);
+ delta.tv_sec = now.tv_sec - last_refil_time.tv_sec;
+ delta.tv_usec = now.tv_usec - last_refil_time.tv_usec;
+ if (delta.tv_usec < 0) {
+ delta.tv_usec += 1000000;
+ delta.tv_sec--;
+ }
+ assert(delta.tv_usec >= 0 && delta.tv_sec >= 0);
+ if (delta.tv_usec < TOKEN_PERIOD) {
+ struct timespec ts;
+ /* Wait until at least one token is available. */
+ ts.tv_sec = TOKEN_PERIOD / 1000000;
+ ts.tv_nsec = (TOKEN_PERIOD % 1000000) * 1000;
+ while (nanosleep(&ts, &ts) < 0 && errno == EINTR)
+ ;
+ goto retry;
+ }
+ generated = (delta.tv_sec * 1000000) / TOKEN_PERIOD;
+ generated +=
+ ((delta.tv_sec * 1000000) % TOKEN_PERIOD + delta.tv_usec) / TOKEN_PERIOD;
+ assert(generated > 0);
+
+ last_refil_time.tv_usec += (generated * TOKEN_PERIOD) % 1000000;
+ last_refil_time.tv_sec += last_refil_time.tv_usec / 1000000;
+ last_refil_time.tv_usec %= 1000000;
+ last_refil_time.tv_sec += (generated * TOKEN_PERIOD) / 1000000;
+ if (generated > TOKENS_MAX)
+ generated = TOKENS_MAX;
+ tokens_avail = generated;
+ }
+ tokens_avail--;
}
static void serial_ioport_write(void *opaque, uint32_t addr, uint32_t val)
@@ -245,9 +309,11 @@ static uint32_t serial_ioport_read(void
ret = s->mcr;
break;
case 5:
+ serial_get_token();
ret = s->lsr;
break;
case 6:
+ serial_get_token();
if (s->mcr & UART_MCR_LOOP) {
/* in loopback, the modem output pins are connected to the
inputs */
@@ -296,12 +362,14 @@ static void serial_receive1(void *opaque
static void serial_receive1(void *opaque, const uint8_t *buf, int size)
{
SerialState *s = opaque;
+ tokens_avail = TOKENS_MAX;
serial_receive_byte(s, buf[0]);
}
static void serial_event(void *opaque, int event)
{
SerialState *s = opaque;
+ tokens_avail = TOKENS_MAX;
if (event == CHR_EVENT_BREAK)
serial_receive_break(s);
}
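
The additions above are the token bucket the comment describes: one token
accrues per TOKEN_PERIOD microseconds, each LSR/MSR read consumes one, and
real receive data refills the bucket instantly. A standalone sketch of the
refill arithmetic (hypothetical helper; the nanosleep wait path is omitted):

    #include <sys/time.h>

    #define TOKEN_PERIOD 1000   /* microseconds per token */
    #define TOKENS_MAX   100000 /* bucket capacity */

    /* Tokens accrued between 'last' and 'now', capped at the bucket size. */
    static int tokens_generated(const struct timeval *last,
                                const struct timeval *now)
    {
        long usec = (now->tv_sec - last->tv_sec) * 1000000L
                  + (now->tv_usec - last->tv_usec);
        long n = usec / TOKEN_PERIOD;
        return n > TOKENS_MAX ? TOKENS_MAX : (int)n;
    }

Capping at TOKENS_MAX bounds the burst a guest can trigger after a long idle
gap, which is the property that protects dom0 from a tight polling loop such
as grub's.
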
diff -r 9da2d9b48ff8 -r f34e37d0742d tools/ioemu/usb-linux.c
--- a/tools/ioemu/usb-linux.c Tue Sep 26 16:15:45 2006 -0600
+++ b/tools/ioemu/usb-linux.c Tue Sep 26 19:11:33 2006 -0600
@@ -26,6 +26,9 @@
#if defined(__linux__)
#include <dirent.h>
#include <sys/ioctl.h>
+/* Some versions of usbdevice_fs.h need __user to be defined for them. */
+/* This may (harmlessly) conflict with a definition in linux/compiler.h. */
+#define __user
#include <linux/usbdevice_fs.h>
#include <linux/version.h>
diff -r 9da2d9b48ff8 -r f34e37d0742d tools/ioemu/vl.c
--- a/tools/ioemu/vl.c Tue Sep 26 16:15:45 2006 -0600
+++ b/tools/ioemu/vl.c Tue Sep 26 19:11:33 2006 -0600
@@ -727,7 +727,7 @@ void qemu_del_timer(QEMUTimer *ts)
void qemu_advance_timer(QEMUTimer *ts, int64_t expire_time)
{
- if (ts->expire_time > expire_time)
+ if (ts->expire_time > expire_time || !qemu_timer_pending(ts))
qemu_mod_timer(ts, expire_time);
}
diff -r 9da2d9b48ff8 -r f34e37d0742d tools/ioemu/vnc.c
--- a/tools/ioemu/vnc.c Tue Sep 26 16:15:45 2006 -0600
+++ b/tools/ioemu/vnc.c Tue Sep 26 19:11:33 2006 -0600
@@ -26,6 +26,7 @@
#include "vl.h"
#include "qemu_socket.h"
+#include <assert.h>
/* The refresh interval starts at BASE. If we scan the buffer and
find no change, we increase by INC, up to MAX. If the mouse moves
@@ -580,12 +581,16 @@ static void _vnc_update_client(void *opa
interested (e.g. minimised) it'll ignore this, and we
can stop scanning the buffer until it sends another
update request. */
- /* Note that there are bugs in xvncviewer which prevent
- this from actually working. Leave the code in place
- for correct clients. */
+ /* It turns out that there's a bug in realvncviewer 4.1.2
+ which means that if you send a proper null update (with
+ no update rectangles), it gets a bit out of sync and
+ never sends any further requests, regardless of whether
+ it needs one or not. Fix this by sending a single 1x1
+ update rectangle instead. */
vnc_write_u8(vs, 0);
vnc_write_u8(vs, 0);
- vnc_write_u16(vs, 0);
+ vnc_write_u16(vs, 1);
+ send_framebuffer_update(vs, 0, 0, 1, 1);
vnc_flush(vs);
vs->last_update_time = now;
return;
@@ -728,8 +733,10 @@ static void vnc_client_read(void *opaque
memmove(vs->input.buffer, vs->input.buffer + len,
vs->input.offset - len);
vs->input.offset -= len;
- } else
+ } else {
+ assert(ret > vs->read_handler_expect);
vs->read_handler_expect = ret;
+ }
}
}
@@ -1076,8 +1083,12 @@ static int protocol_client_msg(VncState
if (len == 1)
return 4;
- if (len == 4)
- return 4 + (read_u16(data, 2) * 4);
+ if (len == 4) {
+ uint16_t v;
+ v = read_u16(data, 2);
+ if (v)
+ return 4 + v * 4;
+ }
limit = read_u16(data, 2);
for (i = 0; i < limit; i++) {
@@ -1117,8 +1128,12 @@ static int protocol_client_msg(VncState
if (len == 1)
return 8;
- if (len == 8)
- return 8 + read_u32(data, 4);
+ if (len == 8) {
+ uint32_t v;
+ v = read_u32(data, 4);
+ if (v)
+ return 8 + v;
+ }
client_cut_text(vs, read_u32(data, 4), data + 8);
break;
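
Both hunks fix the same stall in the variable-length message parser:
protocol_client_msg() returns the total size it expects for the current
message, and the new assert in vnc_client_read() requires that expectation to
grow. When the embedded count is zero, the old code returned a total equal to
the bytes already buffered, so the parser now falls through and consumes the
message immediately instead. A hypothetical helper illustrating the
convention for the SetEncodings case:

    #include <stddef.h>
    #include <stdint.h>

    /* Total size expected for a SetEncodings message, given 'have'
     * buffered bytes: a 4-byte header plus 4 bytes per encoding.
     * RFB integers are big-endian. */
    static size_t expected_len(const uint8_t *data, size_t have)
    {
        if (have < 4)
            return 4;                              /* need the header first */
        uint16_t count = (data[2] << 8) | data[3]; /* u16 at offset 2 */
        return 4 + (size_t)count * 4;              /* equals 4 when count==0 */
    }
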
diff -r 9da2d9b48ff8 -r f34e37d0742d tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py Tue Sep 26 16:15:45 2006 -0600
+++ b/tools/python/xen/xend/XendCheckpoint.py Tue Sep 26 19:11:33 2006 -0600
@@ -161,8 +161,8 @@ def restore(xd, fd):
if handler.store_mfn is None or handler.console_mfn is None:
raise XendError('Could not read store/console MFN')
- #Block until src closes connection
- os.read(fd, 1)
+ os.read(fd, 1) # Wait for source to close connection
+ dominfo.waitForDevices() # Wait for backends to set up
dominfo.unpause()
dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
diff -r 9da2d9b48ff8 -r f34e37d0742d tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py Tue Sep 26 16:15:45 2006 -0600
+++ b/tools/python/xen/xend/XendDomain.py Tue Sep 26 19:11:33 2006 -0600
@@ -487,10 +487,19 @@ class XendDomain:
if not dominfo:
raise XendInvalidDomain(str(domid))
- try:
- return xc.vcpu_setaffinity(dominfo.getDomid(), vcpu, cpumap)
- except Exception, ex:
- raise XendError(str(ex))
+ # if vcpu is keyword 'all', apply the cpumap to all vcpus
+ vcpus = [ vcpu ]
+ if str(vcpu).lower() == "all":
+ vcpus = range(0, int(dominfo.getVCpuCount()))
+
+ # set the same cpumask for all vcpus
+ rc = 0
+ for v in vcpus:
+ try:
+ rc = xc.vcpu_setaffinity(dominfo.getDomid(), int(v), cpumap)
+ except Exception, ex:
+ raise XendError(str(ex))
+ return rc
def domain_cpu_sedf_set(self, domid, period, slice_, latency, extratime,
weight):
diff -r 9da2d9b48ff8 -r f34e37d0742d tools/python/xen/xend/server/SrvDomain.py
--- a/tools/python/xen/xend/server/SrvDomain.py Tue Sep 26 16:15:45 2006 -0600
+++ b/tools/python/xen/xend/server/SrvDomain.py Tue Sep 26 19:11:33 2006 -0600
@@ -97,7 +97,7 @@ class SrvDomain(SrvDir):
def op_pincpu(self, _, req):
fn = FormFn(self.xd.domain_pincpu,
[['dom', 'int'],
- ['vcpu', 'int'],
+ ['vcpu', 'str'],
['cpumap', 'str']])
val = fn(req.args, {'dom': self.dom.domid})
return val
diff -r 9da2d9b48ff8 -r f34e37d0742d tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py Tue Sep 26 16:15:45 2006 -0600
+++ b/tools/python/xen/xm/main.py Tue Sep 26 19:11:33 2006 -0600
@@ -759,12 +759,16 @@ def xm_vcpu_pin(args):
for i in range(int(x),int(y)+1):
cpus.append(int(i))
else:
- cpus.append(int(c))
+ # remove this element from the list
+ if c[0] == '^':
+ cpus = [x for x in cpus if x != int(c[1:])]
+ else:
+ cpus.append(int(c))
cpus.sort()
return cpus
dom = args[0]
- vcpu = int(args[1])
+ vcpu = args[1]
cpumap = cpu_make_map(args[2])
server.xend.domain.pincpu(dom, vcpu, cpumap)
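
The new '^' element is order-sensitive: it removes only CPUs already added by
earlier elements, so "0-3,^2" yields [0, 1, 3] while "^2,0-3" yields
[0, 1, 2, 3]. A rough C transliteration of the parser, for illustration only
(hypothetical, with no bounds or error checking):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Parse "0-3,^2,7"-style CPU lists into a 0/1 membership map. */
    static void cpu_make_map(const char *spec, int *map, int nr_cpus)
    {
        char *copy = strdup(spec), *tok, *save = NULL;
        int lo, hi;

        memset(map, 0, nr_cpus * sizeof(*map));
        for (tok = strtok_r(copy, ",", &save); tok != NULL;
             tok = strtok_r(NULL, ",", &save)) {
            if (tok[0] == '^')
                map[atoi(tok + 1)] = 0;            /* exclusion element */
            else if (sscanf(tok, "%d-%d", &lo, &hi) == 2)
                while (lo <= hi)
                    map[lo++] = 1;                 /* inclusive range */
            else
                map[atoi(tok)] = 1;                /* single CPU */
        }
        free(copy);
    }
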
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/ia64/xen/domain.c
--- a/xen/arch/ia64/xen/domain.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/ia64/xen/domain.c Tue Sep 26 19:11:33 2006 -0600
@@ -54,7 +54,6 @@ static unsigned int dom0_max_vcpus = 1;
static unsigned int dom0_max_vcpus = 1;
integer_param("dom0_max_vcpus", dom0_max_vcpus);
-extern int opt_dom0_vcpus_pin;
extern unsigned long running_on_sim;
extern char dom0_command_line[];
@@ -1021,12 +1020,9 @@ int construct_dom0(struct domain *d,
dom0_max_vcpus = MAX_VIRT_CPUS;
printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
- for ( i = 1; i < dom0_max_vcpus; i++ ) {
+ for ( i = 1; i < dom0_max_vcpus; i++ )
if (alloc_vcpu(d, i, i) == NULL)
printf ("Cannot allocate dom0 vcpu %d\n", i);
- else if (opt_dom0_vcpus_pin)
- d->vcpu[i]->cpu_affinity = cpumask_of_cpu(i);
- }
/* Copy the OS image. */
loaddomainelfimage(d,image_start);
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/ia64/xen/xensetup.c
--- a/xen/arch/ia64/xen/xensetup.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/ia64/xen/xensetup.c Tue Sep 26 19:11:33 2006 -0600
@@ -49,10 +49,6 @@ extern void init_IRQ(void);
extern void init_IRQ(void);
extern void trap_init(void);
-/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
-unsigned int opt_dom0_vcpus_pin = 0;
-boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);
-
/* opt_nosmp: If true, secondary processors are ignored. */
static int opt_nosmp = 0;
boolean_param("nosmp", opt_nosmp);
@@ -521,10 +517,6 @@ printk("num_online_cpus=%d, max_cpus=%d\
0) != 0)
panic("Could not set up DOM0 guest OS\n");
- /* PIN domain0 VCPU 0 on CPU 0. */
- if (opt_dom0_vcpus_pin)
- dom0->vcpu[0]->cpu_affinity = cpumask_of_cpu(0);
-
if (!running_on_sim) // slow on ski and pages are pre-initialized to zero
scrub_heap_pages();
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/Rules.mk
--- a/xen/arch/x86/Rules.mk Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/Rules.mk Tue Sep 26 19:11:33 2006 -0600
@@ -44,7 +44,7 @@ CFLAGS += -fno-asynchronous-unwind-tabl
CFLAGS += -fno-asynchronous-unwind-tables
# -fvisibility=hidden reduces -fpic cost, if it's available
CFLAGS += $(shell $(CC) -v --help 2>&1 | grep " -fvisibility=" | \
- grep -q hidden && echo "-fvisibility=hidden")
+ grep -q hidden && echo "-DGCC_HAS_VISIBILITY_ATTRIBUTE")
LDFLAGS += -m elf_x86_64
x86_32 := n
x86_64 := y
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/boot/x86_32.S
--- a/xen/arch/x86/boot/x86_32.S Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/boot/x86_32.S Tue Sep 26 19:11:33 2006 -0600
@@ -218,28 +218,24 @@ nopaging_gdt_descr:
.word LAST_RESERVED_GDT_BYTE
.long gdt_table - FIRST_RESERVED_GDT_BYTE - __PAGE_OFFSET
- .org 0x1000
-/* NB. Rings != 0 get access up to 0xFC400000. This allows access to the */
-/* machine->physical mapping table. Ring 0 can access all memory. */
+ .align PAGE_SIZE, 0
+/* NB. Rings != 0 get access up to MACH2PHYS_VIRT_END. This allows access to */
+/* the machine->physical mapping table. Ring 0 can access all memory. */
+#define GUEST_DESC(d) \
+ .long ((MACH2PHYS_VIRT_END - 1) >> 12) & 0xffff, \
+ ((MACH2PHYS_VIRT_END - 1) >> 12) & (0xf << 16) | (d)
ENTRY(gdt_table)
.quad 0x0000000000000000 /* unused */
.quad 0x00cf9a000000ffff /* 0xe008 ring 0 4.00GB code at 0x0 */
.quad 0x00cf92000000ffff /* 0xe010 ring 0 4.00GB data at 0x0 */
-#ifdef CONFIG_X86_PAE
- .quad 0x00cfba00000067ff
- .quad 0x00cfb200000067ff
- .quad 0x00cffa00000067ff
- .quad 0x00cff200000067ff
-#else
- .quad 0x00cfba000000c3ff /* 0xe019 ring 1 3.95GB code at 0x0 */
- .quad 0x00cfb2000000c3ff /* 0xe021 ring 1 3.95GB data at 0x0 */
- .quad 0x00cffa000000c3ff /* 0xe02b ring 3 3.95GB code at 0x0 */
- .quad 0x00cff2000000c3ff /* 0xe033 ring 3 3.95GB data at 0x0 */
-#endif
+ GUEST_DESC(0x00c0ba00) /* 0xe019 ring 1 3.xxGB code at 0x0 */
+ GUEST_DESC(0x00c0b200) /* 0xe021 ring 1 3.xxGB data at 0x0 */
+ GUEST_DESC(0x00c0fa00) /* 0xe02b ring 3 3.xxGB code at 0x0 */
+ GUEST_DESC(0x00c0f200) /* 0xe033 ring 3 3.xxGB data at 0x0 */
.quad 0x0000000000000000 /* unused */
.fill 2*NR_CPUS,8,0 /* space for TSS and LDT per CPU */
- .org 0x2000
+ .align PAGE_SIZE, 0
#ifdef CONFIG_X86_PAE
ENTRY(idle_pg_table)
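
GUEST_DESC derives each guest descriptor limit from MACH2PHYS_VIRT_END at
assembly time rather than hard-coding 3.95GB values, so the GDT stays correct
if the mapping boundary moves. The page-granular limit is split across the
descriptor: bits 15:0 in the low word, bits 19:16 next to the attribute bits
in the high word. An equivalent C sketch (illustrative only):

    #include <stdint.h>

    /* Pack a base-0, page-granular GDT descriptor as GUEST_DESC does.
     * 'attr' carries the granularity/type/DPL bits, e.g. 0x00c0ba00
     * for the ring-1 code descriptor above. */
    static uint64_t guest_desc(uint32_t limit_pages, uint32_t attr)
    {
        uint32_t lo = limit_pages & 0xffff;            /* limit[15:0]  */
        uint32_t hi = (limit_pages & 0xf0000) | attr;  /* limit[19:16] */
        return ((uint64_t)hi << 32) | lo;
    }

    /* e.g. guest_desc((MACH2PHYS_VIRT_END - 1) >> 12, 0x00c0ba00) */
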
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/hvm/Makefile
--- a/xen/arch/x86/hvm/Makefile Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/hvm/Makefile Tue Sep 26 19:11:33 2006 -0600
@@ -4,6 +4,7 @@ obj-y += hvm.o
obj-y += hvm.o
obj-y += i8254.o
obj-y += i8259.o
+obj-y += instrlen.o
obj-y += intercept.o
obj-y += io.o
obj-y += platform.o
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/hvm/hvm.c Tue Sep 26 19:11:33 2006 -0600
@@ -337,6 +337,33 @@ int cpu_get_interrupt(struct vcpu *v, in
return -1;
}
+static void hvm_vcpu_down(void)
+{
+ struct vcpu *v = current;
+ struct domain *d = v->domain;
+ int online_count = 0;
+
+ DPRINTK("DOM%d/VCPU%d: going offline.\n", d->domain_id, v->vcpu_id);
+
+ /* Doesn't halt us immediately, but we'll never return to guest context. */
+ set_bit(_VCPUF_down, &v->vcpu_flags);
+ vcpu_sleep_nosync(v);
+
+ /* Any other VCPUs online? ... */
+ LOCK_BIGLOCK(d);
+ for_each_vcpu ( d, v )
+ if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
+ online_count++;
+ UNLOCK_BIGLOCK(d);
+
+ /* ... Shut down the domain if not. */
+ if ( online_count == 0 )
+ {
+ DPRINTK("DOM%d: all CPUs offline -- powering off.\n", d->domain_id);
+ domain_shutdown(d, SHUTDOWN_poweroff);
+ }
+}
+
void hvm_hlt(unsigned long rflags)
{
struct vcpu *v = current;
@@ -344,18 +371,12 @@ void hvm_hlt(unsigned long rflags)
s_time_t next_pit = -1, next_wakeup;
/*
- * Detect machine shutdown. Only do this for vcpu 0, to avoid potentially
- * shutting down the domain early. If we halt with interrupts disabled,
- * that's a pretty sure sign that we want to shut down. In a real
- * processor, NMIs are the only way to break out of this.
+ * If we halt with interrupts disabled, that's a pretty sure sign that we
+ * want to shut down. In a real processor, NMIs are the only way to break
+ * out of this.
*/
- if ( (v->vcpu_id == 0) && !(rflags & X86_EFLAGS_IF) )
- {
- printk("D%d: HLT with interrupts disabled -- shutting down.\n",
- current->domain->domain_id);
- domain_shutdown(current->domain, SHUTDOWN_poweroff);
- return;
- }
+ if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
+ return hvm_vcpu_down();
if ( !v->vcpu_id )
next_pit = get_scheduled(v, pt->irq, pt);
@@ -578,17 +599,20 @@ int hvm_bringup_ap(int vcpuid, int tramp
struct vcpu_guest_context *ctxt;
int rc = 0;
- /* current must be HVM domain BSP */
- if ( !(hvm_guest(bsp) && bsp->vcpu_id == 0) ) {
- printk("Not calling hvm_bringup_ap from BSP context.\n");
+ BUG_ON(!hvm_guest(bsp));
+
+ if ( bsp->vcpu_id != 0 )
+ {
+ DPRINTK("Not calling hvm_bringup_ap from BSP context.\n");
domain_crash_synchronous();
}
if ( (v = d->vcpu[vcpuid]) == NULL )
return -ENOENT;
- if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL ) {
- printk("Failed to allocate memory in hvm_bringup_ap.\n");
+ if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL )
+ {
+ DPRINTK("Failed to allocate memory in hvm_bringup_ap.\n");
return -ENOMEM;
}
@@ -601,12 +625,14 @@ int hvm_bringup_ap(int vcpuid, int tramp
UNLOCK_BIGLOCK(d);
if ( rc != 0 )
- printk("AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
- else {
- if ( test_and_clear_bit(_VCPUF_down, &d->vcpu[vcpuid]->vcpu_flags) )
- vcpu_wake(d->vcpu[vcpuid]);
- printk("AP %d bringup suceeded.\n", vcpuid);
- }
+ {
+ DPRINTK("AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
+ return rc;
+ }
+
+ if ( test_and_clear_bit(_VCPUF_down, &d->vcpu[vcpuid]->vcpu_flags) )
+ vcpu_wake(d->vcpu[vcpuid]);
+ DPRINTK("AP %d bringup suceeded.\n", vcpuid);
xfree(ctxt);
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/hvm/i8259.c
--- a/xen/arch/x86/hvm/i8259.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/hvm/i8259.c Tue Sep 26 19:11:33 2006 -0600
@@ -447,6 +447,10 @@ static void pic_init1(int io_addr, int e
ASSERT(spin_is_locked(&s->pics_state->lock));
pic_reset(s);
+
+ /* XXX We set the ELCR to level triggered here, but that should
+ really be done by the BIOS, and only for PCI IRQs. */
+ s->elcr = 0xff & s->elcr_mask;
}
void pic_init(struct hvm_virpic *s, void (*irq_request)(void *, int),
@@ -458,12 +462,12 @@ void pic_init(struct hvm_virpic *s, void
spin_lock_init(&s->lock);
s->pics[0].pics_state = s;
s->pics[1].pics_state = s;
+ s->pics[0].elcr_mask = 0xf8;
+ s->pics[1].elcr_mask = 0xde;
spin_lock_irqsave(&s->lock, flags);
pic_init1(0x20, 0x4d0, &s->pics[0]);
pic_init1(0xa0, 0x4d1, &s->pics[1]);
spin_unlock_irqrestore(&s->lock, flags);
- s->pics[0].elcr_mask = 0xf8;
- s->pics[1].elcr_mask = 0xde;
s->irq_request = irq_request;
s->irq_request_opaque = irq_request_opaque;
}
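
The reordering matters because pic_init1() now reaches the new ELCR setup
through pic_reset(), so elcr_mask must already be valid on the first reset.
The masks mark which inputs may legally be level-triggered: the timer,
keyboard and cascade lines on the master, and the RTC and FPU lines on the
slave, are hardwired to edge. A guest write to an ELCR port would be filtered
accordingly (hypothetical handler; the struct and field names are taken from
the patch):

    /* Bits outside elcr_mask are hardwired edge-triggered, so mask them
     * out of guest writes; they then always read back as zero. */
    static void elcr_ioport_write(struct hvm_virpic *s, int pic, uint32_t val)
    {
        s->pics[pic].elcr = val & s->pics[pic].elcr_mask;
    }
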
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/hvm/platform.c
--- a/xen/arch/x86/hvm/platform.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/hvm/platform.c Tue Sep 26 19:11:33 2006 -0600
@@ -52,7 +52,7 @@ static inline long __get_reg_value(unsig
case QUAD:
return (long)(reg);
default:
- printf("Error: (__get_reg_value) Invalid reg size\n");
+ printk("Error: (__get_reg_value) Invalid reg size\n");
domain_crash_synchronous();
}
}
@@ -78,7 +78,7 @@ long get_reg_value(int size, int index,
case 7: /* %bh */
return (char)((regs->rbx & 0xFF00) >> 8);
default:
- printf("Error: (get_reg_value) Invalid index value\n");
+ printk("Error: (get_reg_value) Invalid index value\n");
domain_crash_synchronous();
}
/* NOTREACHED */
@@ -102,7 +102,7 @@ long get_reg_value(int size, int index,
case 14: return __get_reg_value(regs->r14, size);
case 15: return __get_reg_value(regs->r15, size);
default:
- printf("Error: (get_reg_value) Invalid index value\n");
+ printk("Error: (get_reg_value) Invalid index value\n");
domain_crash_synchronous();
}
}
@@ -115,7 +115,7 @@ static inline long __get_reg_value(unsig
case LONG:
return (int)(reg & 0xFFFFFFFF);
default:
- printf("Error: (__get_reg_value) Invalid reg size\n");
+ printk("Error: (__get_reg_value) Invalid reg size\n");
domain_crash_synchronous();
}
}
@@ -141,7 +141,7 @@ long get_reg_value(int size, int index,
case 7: /* %bh */
return (char)((regs->ebx & 0xFF00) >> 8);
default:
- printf("Error: (get_reg_value) Invalid index value\n");
+ printk("Error: (get_reg_value) Invalid index value\n");
domain_crash_synchronous();
}
}
@@ -156,7 +156,7 @@ long get_reg_value(int size, int index,
case 6: return __get_reg_value(regs->esi, size);
case 7: return __get_reg_value(regs->edi, size);
default:
- printf("Error: (get_reg_value) Invalid index value\n");
+ printk("Error: (get_reg_value) Invalid index value\n");
domain_crash_synchronous();
}
}
@@ -464,7 +464,7 @@ static int hvm_decode(int realmode, unsi
return DECODE_success;
default:
- printf("%x/%x, This opcode isn't handled yet!\n",
+ printk("%x/%x, This opcode isn't handled yet!\n",
*opcode, ins_subtype);
return DECODE_failure;
}
@@ -614,7 +614,7 @@ static int hvm_decode(int realmode, unsi
break;
default:
- printf("%x, This opcode isn't handled yet!\n", *opcode);
+ printk("%x, This opcode isn't handled yet!\n", *opcode);
return DECODE_failure;
}
@@ -675,12 +675,12 @@ static int hvm_decode(int realmode, unsi
}
else
{
- printf("0f %x, This opcode subtype isn't handled yet\n", *opcode);
+ printk("0f %x, This opcode subtype isn't handled yet\n", *opcode);
return DECODE_failure;
}
default:
- printf("0f %x, This opcode isn't handled yet\n", *opcode);
+ printk("0f %x, This opcode isn't handled yet\n", *opcode);
return DECODE_failure;
}
}
@@ -702,7 +702,7 @@ static void hvm_send_assist_req(struct v
if ( unlikely(p->state != STATE_INVALID) ) {
/* This indicates a bug in the device model. Crash the
domain. */
- printf("Device model set bad IO state %d.\n", p->state);
+ printk("Device model set bad IO state %d.\n", p->state);
domain_crash(v->domain);
return;
}
@@ -733,7 +733,7 @@ void send_pio_req(struct cpu_user_regs *
p = &vio->vp_ioreq;
if ( p->state != STATE_INVALID )
- printf("WARNING: send pio with something already pending (%d)?\n",
+ printk("WARNING: send pio with something already pending (%d)?\n",
p->state);
p->dir = dir;
p->pdata_valid = pvalid;
@@ -776,14 +776,14 @@ void send_mmio_req(
vio = get_vio(v->domain, v->vcpu_id);
if (vio == NULL) {
- printf("bad shared page\n");
+ printk("bad shared page\n");
domain_crash_synchronous();
}
p = &vio->vp_ioreq;
if ( p->state != STATE_INVALID )
- printf("WARNING: send mmio with something already pending (%d)?\n",
+ printk("WARNING: send mmio with something already pending (%d)?\n",
p->state);
p->dir = dir;
p->pdata_valid = pvalid;
@@ -841,7 +841,7 @@ static void mmio_operands(int type, unsi
else
send_mmio_req(type, gpa, 1, inst->op_size, 0, IOREQ_READ, 0);
} else {
- printf("mmio_operands: invalid operand\n");
+ printk("mmio_operands: invalid operand\n");
domain_crash_synchronous();
}
}
@@ -866,8 +866,10 @@ void handle_mmio(unsigned long va, unsig
memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES);
hvm_store_cpu_guest_regs(v, regs, NULL);
- if ((inst_len = hvm_instruction_length(v)) <= 0) {
- printf("handle_mmio: failed to get instruction length\n");
+ inst_len = hvm_instruction_length(regs, hvm_guest_x86_mode(v));
+ if ( inst_len <= 0 )
+ {
+ printk("handle_mmio: failed to get instruction length\n");
domain_crash_synchronous();
}
@@ -880,19 +882,19 @@ void handle_mmio(unsigned long va, unsig
memset(inst, 0, MAX_INST_LEN);
ret = inst_copy_from_guest(inst, inst_addr, inst_len);
if (ret != inst_len) {
- printf("handle_mmio: failed to copy instruction\n");
+ printk("handle_mmio: failed to copy instruction\n");
domain_crash_synchronous();
}
init_instruction(&mmio_inst);
if (hvm_decode(realmode, inst, &mmio_inst) == DECODE_failure) {
- printf("handle_mmio: failed to decode instruction\n");
- printf("mmio opcode: va 0x%lx, gpa 0x%lx, len %d:",
+ printk("handle_mmio: failed to decode instruction\n");
+ printk("mmio opcode: va 0x%lx, gpa 0x%lx, len %d:",
va, gpa, inst_len);
for (i = 0; i < inst_len; i++)
- printf(" %02x", inst[i] & 0xFF);
- printf("\n");
+ printk(" %02x", inst[i] & 0xFF);
+ printk("\n");
domain_crash_synchronous();
}
@@ -1073,7 +1075,7 @@ void handle_mmio(unsigned long va, unsig
break;
default:
- printf("Unhandled MMIO instruction\n");
+ printk("Unhandled MMIO instruction\n");
domain_crash_synchronous();
}
}
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/hvm/svm/Makefile
--- a/xen/arch/x86/hvm/svm/Makefile Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/hvm/svm/Makefile Tue Sep 26 19:11:33 2006 -0600
@@ -2,7 +2,6 @@ subdir-$(x86_64) += x86_64
subdir-$(x86_64) += x86_64
obj-y += emulate.o
-obj-y += instrlen.o
obj-y += intr.o
obj-y += svm.o
obj-y += vmcb.o
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/hvm/svm/svm.c Tue Sep 26 19:11:33 2006 -0600
@@ -44,6 +44,7 @@
#include <asm/hvm/svm/emulate.h>
#include <asm/hvm/svm/vmmcall.h>
#include <asm/hvm/svm/intr.h>
+#include <asm/x86_emulate.h>
#include <public/sched.h>
#define SVM_EXTRA_DEBUG
@@ -60,7 +61,6 @@ extern asmlinkage void do_IRQ(struct cpu
extern asmlinkage void do_IRQ(struct cpu_user_regs *);
extern void send_pio_req(struct cpu_user_regs *regs, unsigned long port,
unsigned long count, int size, long value, int dir,
int pvalid);
-extern int svm_instrlen(struct cpu_user_regs *regs, int mode);
extern void svm_dump_inst(unsigned long eip);
extern int svm_dbg_on;
void svm_dump_regs(const char *from, struct cpu_user_regs *regs);
@@ -468,21 +468,19 @@ static int svm_realmode(struct vcpu *v)
return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE);
}
-int svm_guest_x86_mode(struct vcpu *v)
+static int svm_guest_x86_mode(struct vcpu *v)
{
struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
- unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
- /* check which operating mode the guest is running */
- if( vmcb->efer & EFER_LMA )
- mode = vmcb->cs.attributes.fields.l ? 8 : 4;
- else
- mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
- return mode;
-}
-
-int svm_instruction_length(struct vcpu *v)
-{
- return svm_instrlen(guest_cpu_user_regs(), svm_guest_x86_mode(v));
+
+ if ( vmcb->efer & EFER_LMA )
+ return (vmcb->cs.attributes.fields.l ?
+ X86EMUL_MODE_PROT64 : X86EMUL_MODE_PROT32);
+
+ if ( svm_realmode(v) )
+ return X86EMUL_MODE_REAL;
+
+ return (vmcb->cs.attributes.fields.db ?
+ X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16);
}
void svm_update_host_cr3(struct vcpu *v)
@@ -878,7 +876,6 @@ int start_svm(void)
hvm_funcs.long_mode_enabled = svm_long_mode_enabled;
hvm_funcs.pae_enabled = svm_pae_enabled;
hvm_funcs.guest_x86_mode = svm_guest_x86_mode;
- hvm_funcs.instruction_length = svm_instruction_length;
hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
hvm_funcs.update_host_cr3 = svm_update_host_cr3;
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Tue Sep 26 19:11:33 2006 -0600
@@ -37,36 +37,119 @@
#include <xen/keyhandler.h>
#include <asm/shadow.h>
-static int vmcs_size;
-static int vmcs_order;
+/* Basic flags for Pin-based VM-execution controls. */
+#define MONITOR_PIN_BASED_EXEC_CONTROLS \
+ ( PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING )
+
+/* Basic flags for CPU-based VM-execution controls. */
+#ifdef __x86_64__
+#define MONITOR_CPU_BASED_EXEC_CONTROLS_SUBARCH \
+ ( CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING )
+#else
+#define MONITOR_CPU_BASED_EXEC_CONTROLS_SUBARCH 0
+#endif
+#define MONITOR_CPU_BASED_EXEC_CONTROLS \
+ ( MONITOR_CPU_BASED_EXEC_CONTROLS_SUBARCH | \
+ CPU_BASED_HLT_EXITING | \
+ CPU_BASED_INVDPG_EXITING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_ACTIVATE_IO_BITMAP | \
+ CPU_BASED_USE_TSC_OFFSETING )
+
+/* Basic flags for VM-Exit controls. */
+#ifdef __x86_64__
+#define MONITOR_VM_EXIT_CONTROLS_SUBARCH VM_EXIT_IA32E_MODE
+#else
+#define MONITOR_VM_EXIT_CONTROLS_SUBARCH 0
+#endif
+#define MONITOR_VM_EXIT_CONTROLS \
+ ( MONITOR_VM_EXIT_CONTROLS_SUBARCH | \
+ VM_EXIT_ACK_INTR_ON_EXIT )
+
+/* Basic flags for VM-Entry controls. */
+#define MONITOR_VM_ENTRY_CONTROLS 0x00000000
+
+/* Dynamic (run-time adjusted) execution control flags. */
+static u32 vmx_pin_based_exec_control;
+static u32 vmx_cpu_based_exec_control;
+static u32 vmx_vmexit_control;
+static u32 vmx_vmentry_control;
+
static u32 vmcs_revision_id;
+static u32 adjust_vmx_controls(u32 ctrls, u32 msr)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+
+ /* Bit == 0 means must be zero. */
+ BUG_ON(ctrls & ~vmx_msr_high);
+
+ /* Bit == 1 means must be one. */
+ ctrls |= vmx_msr_low;
+
+ return ctrls;
+}
+
void vmx_init_vmcs_config(void)
{
u32 vmx_msr_low, vmx_msr_high;
-
- if ( vmcs_size )
- return;
+ u32 _vmx_pin_based_exec_control;
+ u32 _vmx_cpu_based_exec_control;
+ u32 _vmx_vmexit_control;
+ u32 _vmx_vmentry_control;
+
+ _vmx_pin_based_exec_control =
+ adjust_vmx_controls(MONITOR_PIN_BASED_EXEC_CONTROLS,
+ MSR_IA32_VMX_PINBASED_CTLS_MSR);
+ _vmx_cpu_based_exec_control =
+ adjust_vmx_controls(MONITOR_CPU_BASED_EXEC_CONTROLS,
+ MSR_IA32_VMX_PROCBASED_CTLS_MSR);
+ _vmx_vmexit_control =
+ adjust_vmx_controls(MONITOR_VM_EXIT_CONTROLS,
+ MSR_IA32_VMX_EXIT_CTLS_MSR);
+ _vmx_vmentry_control =
+ adjust_vmx_controls(MONITOR_VM_ENTRY_CONTROLS,
+ MSR_IA32_VMX_ENTRY_CTLS_MSR);
rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high);
- vmcs_revision_id = vmx_msr_low;
-
- vmcs_size = vmx_msr_high & 0x1fff;
- vmcs_order = get_order_from_bytes(vmcs_size);
+ if ( smp_processor_id() == 0 )
+ {
+ vmcs_revision_id = vmx_msr_low;
+ vmx_pin_based_exec_control = _vmx_pin_based_exec_control;
+ vmx_cpu_based_exec_control = _vmx_cpu_based_exec_control;
+ vmx_vmexit_control = _vmx_vmexit_control;
+ vmx_vmentry_control = _vmx_vmentry_control;
+ }
+ else
+ {
+ BUG_ON(vmcs_revision_id != vmx_msr_low);
+ BUG_ON(vmx_pin_based_exec_control != _vmx_pin_based_exec_control);
+ BUG_ON(vmx_cpu_based_exec_control != _vmx_cpu_based_exec_control);
+ BUG_ON(vmx_vmexit_control != _vmx_vmexit_control);
+ BUG_ON(vmx_vmentry_control != _vmx_vmentry_control);
+ }
+
+ /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+ BUG_ON((vmx_msr_high & 0x1fff) > PAGE_SIZE);
}
static struct vmcs_struct *vmx_alloc_vmcs(void)
{
struct vmcs_struct *vmcs;
- if ( (vmcs = alloc_xenheap_pages(vmcs_order)) == NULL )
+ if ( (vmcs = alloc_xenheap_page()) == NULL )
{
DPRINTK("Failed to allocate VMCS.\n");
return NULL;
}
- memset(vmcs, 0, vmcs_size); /* don't remove this */
+ memset(vmcs, 0, PAGE_SIZE);
vmcs->vmcs_revision_id = vmcs_revision_id;
return vmcs;
@@ -74,7 +157,7 @@ static struct vmcs_struct *vmx_alloc_vmc
static void vmx_free_vmcs(struct vmcs_struct *vmcs)
{
- free_xenheap_pages(vmcs, vmcs_order);
+ free_xenheap_page(vmcs);
}
static void __vmx_clear_vmcs(void *info)
@@ -156,12 +239,11 @@ static inline int construct_vmcs_control
{
int error = 0;
- error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL,
- MONITOR_PIN_BASED_EXEC_CONTROLS);
-
- error |= __vmwrite(VM_EXIT_CONTROLS, MONITOR_VM_EXIT_CONTROLS);
-
- error |= __vmwrite(VM_ENTRY_CONTROLS, MONITOR_VM_ENTRY_CONTROLS);
+ error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
+
+ error |= __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
+
+ error |= __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);
error |= __vmwrite(IO_BITMAP_A, virt_to_maddr(arch_vmx->io_bitmap_a));
error |= __vmwrite(IO_BITMAP_B, virt_to_maddr(arch_vmx->io_bitmap_b));
@@ -246,9 +328,8 @@ static void vmx_do_launch(struct vcpu *v
error |= __vmwrite(GUEST_CR0, cr0);
cr0 &= ~X86_CR0_PG;
error |= __vmwrite(CR0_READ_SHADOW, cr0);
- error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
- MONITOR_CPU_BASED_EXEC_CONTROLS);
- v->arch.hvm_vcpu.u.vmx.exec_control = MONITOR_CPU_BASED_EXEC_CONTROLS;
+ error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control);
+ v->arch.hvm_vcpu.u.vmx.exec_control = vmx_cpu_based_exec_control;
__asm__ __volatile__ ("mov %%cr4,%0" : "=r" (cr4) : );
@@ -297,21 +378,21 @@ static inline int construct_init_vmcs_gu
/* MSR */
error |= __vmwrite(VM_EXIT_MSR_LOAD_ADDR, 0);
error |= __vmwrite(VM_EXIT_MSR_STORE_ADDR, 0);
-
error |= __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
error |= __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
error |= __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
- /* interrupt */
+
error |= __vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0);
- /* mask */
- error |= __vmwrite(CR0_GUEST_HOST_MASK, -1UL);
- error |= __vmwrite(CR4_GUEST_HOST_MASK, -1UL);
+
+ error |= __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
+ error |= __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);
error |= __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
error |= __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
- /* TSC */
error |= __vmwrite(CR3_TARGET_COUNT, 0);
+
+ error |= __vmwrite(GUEST_ACTIVITY_STATE, 0);
/* Guest Selectors */
error |= __vmwrite(GUEST_ES_SELECTOR, GUEST_LAUNCH_DS);
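
adjust_vmx_controls() relies on the encoding of the IA32_VMX_*_CTLS
capability MSRs: the low 32 bits carry a 1 for every control that must be 1,
and the high 32 bits carry a 1 for every control that is allowed to be 1.
ORing in the low half turns on the mandatory bits; a requested bit that is
clear in the high half means the CPU lacks the feature, hence the BUG_ON. A
worked example with made-up capability values:

    #include <assert.h>
    #include <stdint.h>

    /* Same logic as adjust_vmx_controls(), with the rdmsr() factored
     * out so it can be exercised with made-up values. */
    static uint32_t adjust(uint32_t want, uint32_t msr_lo, uint32_t msr_hi)
    {
        assert(!(want & ~msr_hi)); /* every requested bit must be allowed */
        return want | msr_lo;      /* force the must-be-one bits on */
    }

    int main(void)
    {
        /* Hypothetical MSR: bits 0-3 mandatory, bits 0-7 allowed. */
        uint32_t ctl = adjust(0x30, 0x0f, 0xff);
        assert(ctl == 0x3f);       /* requested bits plus mandatory ones */
        return 0;
    }
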
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/hvm/vmx/vmx.c Tue Sep 26 19:11:33 2006 -0600
@@ -45,6 +45,7 @@
#include <public/hvm/ioreq.h>
#include <asm/hvm/vpic.h>
#include <asm/hvm/vlapic.h>
+#include <asm/x86_emulate.h>
extern uint32_t vlapic_update_ppr(struct vlapic *vlapic);
@@ -593,15 +594,6 @@ static void vmx_load_cpu_guest_regs(stru
vmx_vmcs_exit(v);
}
-static int vmx_instruction_length(struct vcpu *v)
-{
- unsigned long inst_len;
-
- if ( __vmread(VM_EXIT_INSTRUCTION_LEN, &inst_len) ) /* XXX Unsafe XXX */
- return 0;
- return inst_len;
-}
-
static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
{
switch ( num )
@@ -692,21 +684,6 @@ static void vmx_init_ap_context(struct v
void do_nmi(struct cpu_user_regs *);
-static int check_vmx_controls(u32 ctrls, u32 msr)
-{
- u32 vmx_msr_low, vmx_msr_high;
-
- rdmsr(msr, vmx_msr_low, vmx_msr_high);
- if ( (ctrls < vmx_msr_low) || (ctrls > vmx_msr_high) )
- {
- printk("Insufficient VMX capability 0x%x, "
- "msr=0x%x,low=0x%8x,high=0x%x\n",
- ctrls, msr, vmx_msr_low, vmx_msr_high);
- return 0;
- }
- return 1;
-}
-
static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
{
char *p;
@@ -729,6 +706,35 @@ static void vmx_init_hypercall_page(stru
*(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
}
+static int vmx_realmode(struct vcpu *v)
+{
+ unsigned long rflags;
+
+ ASSERT(v == current);
+
+ __vmread(GUEST_RFLAGS, &rflags);
+ return rflags & X86_EFLAGS_VM;
+}
+
+static int vmx_guest_x86_mode(struct vcpu *v)
+{
+ unsigned long cs_ar_bytes;
+
+ ASSERT(v == current);
+
+ __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+
+ if ( vmx_long_mode_enabled(v) )
+ return ((cs_ar_bytes & (1u<<13)) ?
+ X86EMUL_MODE_PROT64 : X86EMUL_MODE_PROT32);
+
+ if ( vmx_realmode(v) )
+ return X86EMUL_MODE_REAL;
+
+ return ((cs_ar_bytes & (1u<<14)) ?
+ X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16);
+}
+
/* Setup HVM interfaces */
static void vmx_setup_hvm_funcs(void)
{
@@ -748,7 +754,6 @@ static void vmx_setup_hvm_funcs(void)
hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
hvm_funcs.pae_enabled = vmx_pae_enabled;
hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
- hvm_funcs.instruction_length = vmx_instruction_length;
hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
@@ -771,7 +776,7 @@ int start_vmx(void)
*/
boot_cpu_data.x86_capability[4] = cpuid_ecx(1);
- if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
+ if ( !test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability) )
return 0;
rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx);
@@ -791,24 +796,11 @@ int start_vmx(void)
IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
}
- if ( !check_vmx_controls(MONITOR_PIN_BASED_EXEC_CONTROLS,
- MSR_IA32_VMX_PINBASED_CTLS_MSR) )
- return 0;
- if ( !check_vmx_controls(MONITOR_CPU_BASED_EXEC_CONTROLS,
- MSR_IA32_VMX_PROCBASED_CTLS_MSR) )
- return 0;
- if ( !check_vmx_controls(MONITOR_VM_EXIT_CONTROLS,
- MSR_IA32_VMX_EXIT_CTLS_MSR) )
- return 0;
- if ( !check_vmx_controls(MONITOR_VM_ENTRY_CONTROLS,
- MSR_IA32_VMX_ENTRY_CTLS_MSR) )
- return 0;
-
set_in_cr4(X86_CR4_VMXE);
vmx_init_vmcs_config();
-
- if(!smp_processor_id())
+
+ if ( smp_processor_id() == 0 )
setup_vmcs_dump();
if ( (vmcs = vmx_alloc_host_vmcs()) == NULL )
@@ -1499,7 +1491,7 @@ static int vmx_set_cr0(unsigned long val
&v->arch.hvm_vmx.cpu_state);
__vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
- vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
+ vm_entry_value |= VM_ENTRY_IA32E_MODE;
__vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
}
#endif
@@ -1553,7 +1545,7 @@ static int vmx_set_cr0(unsigned long val
clear_bit(VMX_CPU_STATE_LMA_ENABLED,
&v->arch.hvm_vmx.cpu_state);
__vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
- vm_entry_value &= ~VM_ENTRY_CONTROLS_IA32E_MODE;
+ vm_entry_value &= ~VM_ENTRY_IA32E_MODE;
__vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
}
}
@@ -2276,15 +2268,8 @@ asmlinkage void vmx_vmexit_handler(struc
domain_crash_synchronous();
break;
case EXIT_REASON_PENDING_INTERRUPT:
- /*
- * Not sure exactly what the purpose of this is. The only bits set
- * and cleared at this point are CPU_BASED_VIRTUAL_INTR_PENDING.
- * (in io.c:{enable,disable}_irq_window(). So presumably we want to
- * set it to the original value...
- */
+ /* Disable the interrupt window. */
v->arch.hvm_vcpu.u.vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- v->arch.hvm_vcpu.u.vmx.exec_control |=
- (MONITOR_CPU_BASED_EXEC_CONTROLS & CPU_BASED_VIRTUAL_INTR_PENDING);
__vmwrite(CPU_BASED_VM_EXEC_CONTROL,
v->arch.hvm_vcpu.u.vmx.exec_control);
break;
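
With this change both vendors' guest_x86_mode hooks return the
X86EMUL_MODE_* constants from asm/x86_emulate.h rather than ad-hoc 2/4/8
widths, which is what lets the shared hvm_instruction_length() (instrlen.c,
below) serve SVM and VMX alike. The decision tree is the same on both sides;
schematically (a sketch with the inputs named explicitly):

    /* 'lma' is EFER.LMA (long mode active), 'cs_l' and 'cs_db' are the
     * CS descriptor L and D/B bits, 'realmode' covers real and vm86. */
    static int guest_x86_mode(int lma, int cs_l, int cs_db, int realmode)
    {
        if (lma) /* compat mode (L clear) runs as 32-bit protected */
            return cs_l ? X86EMUL_MODE_PROT64 : X86EMUL_MODE_PROT32;
        if (realmode)
            return X86EMUL_MODE_REAL;
        return cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
    }
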
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/irq.c Tue Sep 26 19:11:33 2006 -0600
@@ -351,11 +351,15 @@ int pirq_acktype(int irq)
desc = &irq_desc[vector];
+ if ( desc->handler == &no_irq_type )
+ return ACKTYPE_NONE;
+
/*
- * Edge-triggered IO-APIC interrupts need no final acknowledgement:
- * we ACK early during interrupt processing.
+ * Edge-triggered IO-APIC and LAPIC interrupts need no final
+ * acknowledgement: we ACK early during interrupt processing.
*/
- if ( !strcmp(desc->handler->typename, "IO-APIC-edge") )
+ if ( !strcmp(desc->handler->typename, "IO-APIC-edge") ||
+ !strcmp(desc->handler->typename, "local-APIC-edge") )
return ACKTYPE_NONE;
/*
@@ -376,7 +380,9 @@ int pirq_acktype(int irq)
return ACKTYPE_NONE; /* edge-triggered => no final EOI */
}
+ printk("Unknown PIC type '%s' for IRQ %d\n", desc->handler->typename, irq);
BUG();
+
return 0;
}
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/arch/x86/setup.c Tue Sep 26 19:11:33 2006 -0600
@@ -272,6 +272,13 @@ void __init __start_xen(multiboot_info_t
EARLY_FAIL();
}
+ /*
+ * Some stubs are built on the stack and use direct calls/jumps, so the
+ * heap must be confined to the lower 2GB for those branches to reach
+ * their targets.
+ */
+ if ( opt_xenheap_megabytes > 2048 )
+ opt_xenheap_megabytes = 2048;
xenheap_phys_end = opt_xenheap_megabytes << 20;
if ( mbi->flags & MBI_MEMMAP )
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/common/domain.c
--- a/xen/common/domain.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/common/domain.c Tue Sep 26 19:11:33 2006 -0600
@@ -82,20 +82,16 @@ struct vcpu *alloc_vcpu(
v->domain = d;
v->vcpu_id = vcpu_id;
- v->processor = cpu_id;
v->vcpu_info = &d->shared_info->vcpu_info[vcpu_id];
spin_lock_init(&v->pause_lock);
- v->cpu_affinity = is_idle_domain(d) ?
- cpumask_of_cpu(cpu_id) : CPU_MASK_ALL;
-
v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
v->runstate.state_entry_time = NOW();
if ( (vcpu_id != 0) && !is_idle_domain(d) )
set_bit(_VCPUF_down, &v->vcpu_flags);
- if ( sched_init_vcpu(v) < 0 )
+ if ( sched_init_vcpu(v, cpu_id) < 0 )
{
free_vcpu_struct(v);
return NULL;
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/common/gdbstub.c
--- a/xen/common/gdbstub.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/common/gdbstub.c Tue Sep 26 19:11:33 2006 -0600
@@ -53,6 +53,8 @@ static char opt_gdb[30] = "none";
static char opt_gdb[30] = "none";
string_param("gdb", opt_gdb);
+static void gdbstub_console_puts(const char *str);
+
/* value <-> char (de)serialzers */
char
hex2char(unsigned long x)
@@ -360,7 +362,6 @@ static void
static void
gdbstub_attach(struct gdb_context *ctx)
{
- static void gdbstub_console_puts(const char *str);
if ( ctx->currently_attached )
return;
ctx->currently_attached = 1;
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/common/schedule.c
--- a/xen/common/schedule.c Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/common/schedule.c Tue Sep 26 19:11:33 2006 -0600
@@ -37,6 +37,10 @@ static char opt_sched[10] = "credit";
static char opt_sched[10] = "credit";
string_param("sched", opt_sched);
+/* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
+static unsigned int opt_dom0_vcpus_pin;
+boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);
+
#define TIME_SLOP (s32)MICROSECS(50) /* allow time to slip a bit */
/* Various timer handlers. */
@@ -97,13 +101,26 @@ void vcpu_runstate_get(struct vcpu *v, s
}
}
-int sched_init_vcpu(struct vcpu *v)
-{
+int sched_init_vcpu(struct vcpu *v, unsigned int processor)
+{
+ struct domain *d = v->domain;
+
+ /*
+ * Initialize processor and affinity settings. The idler, and potentially
+ * domain-0 VCPUs, are pinned onto their respective physical CPUs.
+ */
+ v->processor = processor;
+ if ( is_idle_domain(d) || ((d->domain_id == 0) && opt_dom0_vcpus_pin) )
+ v->cpu_affinity = cpumask_of_cpu(processor);
+ else
+ v->cpu_affinity = CPU_MASK_ALL;
+
/* Initialise the per-domain timers. */
init_timer(&v->timer, vcpu_timer_fn, v, v->processor);
init_timer(&v->poll_timer, poll_timer_fn, v, v->processor);
- if ( is_idle_vcpu(v) )
+ /* Idle VCPUs are scheduled immediately. */
+ if ( is_idle_domain(d) )
{
per_cpu(schedule_data, v->processor).curr = v;
per_cpu(schedule_data, v->processor).idle = v;
@@ -211,6 +228,9 @@ int vcpu_set_affinity(struct vcpu *v, cp
{
cpumask_t online_affinity;
unsigned long flags;
+
+ if ( (v->domain->domain_id == 0) && opt_dom0_vcpus_pin )
+ return -EINVAL;
cpus_and(online_affinity, *affinity, cpu_online_map);
if ( cpus_empty(online_affinity) )
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/include/asm-x86/hvm/hvm.h Tue Sep 26 19:11:33 2006 -0600
@@ -51,15 +51,13 @@ struct hvm_function_table {
* Examine specifics of the guest state:
* 1) determine whether the guest is in real or vm8086 mode,
* 2) determine whether paging is enabled,
- * 3) return the length of the instruction that caused an exit.
- * 4) return the current guest control-register value
+ * 3) return the current guest control-register value
*/
int (*realmode)(struct vcpu *v);
int (*paging_enabled)(struct vcpu *v);
int (*long_mode_enabled)(struct vcpu *v);
int (*pae_enabled)(struct vcpu *v);
int (*guest_x86_mode)(struct vcpu *v);
- int (*instruction_length)(struct vcpu *v);
unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
/*
@@ -159,11 +157,7 @@ hvm_guest_x86_mode(struct vcpu *v)
return hvm_funcs.guest_x86_mode(v);
}
-static inline int
-hvm_instruction_length(struct vcpu *v)
-{
- return hvm_funcs.instruction_length(v);
-}
+int hvm_instruction_length(struct cpu_user_regs *regs, int mode);
static inline void
hvm_update_host_cr3(struct vcpu *v)
@@ -182,9 +176,9 @@ hvm_get_guest_ctrl_reg(struct vcpu *v, u
return 0; /* force to fail */
}
-extern void hvm_stts(struct vcpu *v);
-extern void hvm_set_guest_time(struct vcpu *v, u64 gtime);
-extern void hvm_do_resume(struct vcpu *v);
+void hvm_stts(struct vcpu *v);
+void hvm_set_guest_time(struct vcpu *v, u64 gtime);
+void hvm_do_resume(struct vcpu *v);
static inline void
hvm_init_ap_context(struct vcpu_guest_context *ctxt,
@@ -193,6 +187,6 @@ hvm_init_ap_context(struct vcpu_guest_co
return hvm_funcs.init_ap_context(ctxt, vcpuid, trampoline_vector);
}
-extern int hvm_bringup_ap(int vcpuid, int trampoline_vector);
+int hvm_bringup_ap(int vcpuid, int trampoline_vector);
#endif /* __ASM_X86_HVM_HVM_H__ */
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h Tue Sep 26 19:11:33 2006 -0600
@@ -132,12 +132,16 @@ extern int vmcs_version;
#define CPU_BASED_ACTIVATE_IO_BITMAP 0x02000000
#define CPU_BASED_MONITOR_EXITING 0x20000000
#define CPU_BASED_PAUSE_EXITING 0x40000000
-#define PIN_BASED_EXT_INTR_MASK 0x1
-#define PIN_BASED_NMI_EXITING 0x8
-
+
+#define PIN_BASED_EXT_INTR_MASK 0x00000001
+#define PIN_BASED_NMI_EXITING 0x00000008
+
+#define VM_EXIT_IA32E_MODE 0x00000200
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
-#define VM_EXIT_HOST_ADD_SPACE_SIZE 0x00000200
-
+
+#define VM_ENTRY_IA32E_MODE 0x00000200
+#define VM_ENTRY_SMM 0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
/* VMCS Encodings */
enum vmcs_field {
@@ -217,6 +221,7 @@ enum vmcs_field {
GUEST_LDTR_AR_BYTES = 0x00004820,
GUEST_TR_AR_BYTES = 0x00004822,
GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
+ GUEST_ACTIVITY_STATE = 0x00004826,
GUEST_SYSENTER_CS = 0x0000482A,
HOST_IA32_SYSENTER_CS = 0x00004c00,
CR0_GUEST_HOST_MASK = 0x00006000,
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h Tue Sep 26 19:11:33 2006 -0600
@@ -35,73 +35,6 @@ extern void set_guest_time(struct vcpu *
extern void set_guest_time(struct vcpu *v, u64 gtime);
extern unsigned int cpu_rev;
-
-/*
- * Need fill bits for SENTER
- */
-
-#define MONITOR_PIN_BASED_EXEC_CONTROLS_RESERVED_VALUE 0x00000016
-
-#define MONITOR_PIN_BASED_EXEC_CONTROLS \
- ( \
- MONITOR_PIN_BASED_EXEC_CONTROLS_RESERVED_VALUE | \
- PIN_BASED_EXT_INTR_MASK | \
- PIN_BASED_NMI_EXITING \
- )
-
-#define MONITOR_CPU_BASED_EXEC_CONTROLS_RESERVED_VALUE 0x0401e172
-
-#define _MONITOR_CPU_BASED_EXEC_CONTROLS \
- ( \
- MONITOR_CPU_BASED_EXEC_CONTROLS_RESERVED_VALUE | \
- CPU_BASED_HLT_EXITING | \
- CPU_BASED_INVDPG_EXITING | \
- CPU_BASED_MWAIT_EXITING | \
- CPU_BASED_MOV_DR_EXITING | \
- CPU_BASED_ACTIVATE_IO_BITMAP | \
- CPU_BASED_USE_TSC_OFFSETING \
- )
-
-#define MONITOR_CPU_BASED_EXEC_CONTROLS_IA32E_MODE \
- ( \
- CPU_BASED_CR8_LOAD_EXITING | \
- CPU_BASED_CR8_STORE_EXITING \
- )
-
-#define MONITOR_VM_EXIT_CONTROLS_RESERVED_VALUE 0x0003edff
-
-#define MONITOR_VM_EXIT_CONTROLS_IA32E_MODE 0x00000200
-
-#define _MONITOR_VM_EXIT_CONTROLS \
- ( \
- MONITOR_VM_EXIT_CONTROLS_RESERVED_VALUE | \
- VM_EXIT_ACK_INTR_ON_EXIT \
- )
-
-#if defined (__x86_64__)
-#define MONITOR_CPU_BASED_EXEC_CONTROLS \
- ( \
- _MONITOR_CPU_BASED_EXEC_CONTROLS | \
- MONITOR_CPU_BASED_EXEC_CONTROLS_IA32E_MODE \
- )
-#define MONITOR_VM_EXIT_CONTROLS \
- ( \
- _MONITOR_VM_EXIT_CONTROLS | \
- MONITOR_VM_EXIT_CONTROLS_IA32E_MODE \
- )
-#else
-#define MONITOR_CPU_BASED_EXEC_CONTROLS \
- _MONITOR_CPU_BASED_EXEC_CONTROLS
-
-#define MONITOR_VM_EXIT_CONTROLS \
- _MONITOR_VM_EXIT_CONTROLS
-#endif
-
-#define VM_ENTRY_CONTROLS_RESERVED_VALUE 0x000011ff
-#define VM_ENTRY_CONTROLS_IA32E_MODE 0x00000200
-
-#define MONITOR_VM_ENTRY_CONTROLS \
- VM_ENTRY_CONTROLS_RESERVED_VALUE
/*
* Exit Reasons
@@ -425,36 +358,10 @@ static inline int vmx_pae_enabled(struct
}
/* Works only for vcpu == current */
-static inline int vmx_realmode(struct vcpu *v)
-{
- unsigned long rflags;
- ASSERT(v == current);
-
- __vmread(GUEST_RFLAGS, &rflags);
- return rflags & X86_EFLAGS_VM;
-}
-
-/* Works only for vcpu == current */
static inline void vmx_update_host_cr3(struct vcpu *v)
{
ASSERT(v == current);
__vmwrite(HOST_CR3, v->arch.cr3);
-}
-
-static inline int vmx_guest_x86_mode(struct vcpu *v)
-{
- unsigned long cs_ar_bytes;
- ASSERT(v == current);
-
- if ( vmx_long_mode_enabled(v) )
- {
- __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
- return (cs_ar_bytes & (1u<<13)) ? 8 : 4;
- }
- if ( vmx_realmode(v) )
- return 2;
- __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
- return (cs_ar_bytes & (1u<<14)) ? 4 : 2;
}
static inline int vmx_pgbit_test(struct vcpu *v)
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/include/xen/compiler.h
--- a/xen/include/xen/compiler.h Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/include/xen/compiler.h Tue Sep 26 19:11:33 2006 -0600
@@ -35,7 +35,7 @@
#define offsetof(a,b) ((unsigned long)&(((a *)0)->b))
#endif
-#if defined(__x86_64__) && (__GNUC__ > 3)
+#ifdef GCC_HAS_VISIBILITY_ATTRIBUTE
/* Results in more efficient PIC code (no indirections through GOT or PLT). */
#pragma GCC visibility push(hidden)
#endif
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Tue Sep 26 16:15:45 2006 -0600
+++ b/xen/include/xen/sched.h Tue Sep 26 19:11:33 2006 -0600
@@ -280,7 +280,7 @@ void new_thread(struct vcpu *d,
#define set_current_state(_s) do { current->state = (_s); } while (0)
void scheduler_init(void);
void schedulers_start(void);
-int sched_init_vcpu(struct vcpu *);
+int sched_init_vcpu(struct vcpu *v, unsigned int processor);
void sched_destroy_domain(struct domain *);
long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
int sched_id(void);
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/hvm/instrlen.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/instrlen.c Tue Sep 26 19:11:33 2006 -0600
@@ -0,0 +1,474 @@
+/*
+ * instrlen.c - calculates the instruction length for all operating modes
+ *
+ * Travis Betak, travis.betak@xxxxxxx
+ * Copyright (c) 2005,2006 AMD
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Essentially a very, very stripped version of Keir Fraser's work in
+ * x86_emulate.c. Used for MMIO.
+ */
+
+/*
+ * TODO: The way in which we use hvm_instruction_length is very inefficient as
+ * it now stands. It will be worthwhile to return the actual instruction buffer
+ * along with the instruction length since one of the reasons we are getting
+ * the instruction length is to know how many instruction bytes we need to
+ * fetch.
+ */
+
+#include <xen/config.h>
+#include <xen/sched.h>
+#include <xen/mm.h>
+#include <asm/regs.h>
+#include <asm-x86/x86_emulate.h>
+
+/* read from guest memory */
+extern int inst_copy_from_guest(unsigned char *buf, unsigned long eip,
+ int length);
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp (1<<0) /* 8-bit operands. */
+/* Destination operand type. */
+#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
+#define DstReg (2<<1) /* Register operand. */
+#define DstMem (3<<1) /* Memory operand. */
+#define DstMask (3<<1)
+/* Source operand type. */
+#define SrcNone (0<<3) /* No source operand. */
+#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
+#define SrcReg (1<<3) /* Register operand. */
+#define SrcMem (2<<3) /* Memory operand. */
+#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
+#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
+#define SrcImm (5<<3) /* Immediate operand. */
+#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
+#define SrcMask (7<<3)
+/* Generic ModRM decode. */
+#define ModRM (1<<6)
+/* Destination is only written; never read. */
+#define Mov (1<<7)
+
+static uint8_t opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+ 0, 0, 0, 0,
+ /* 0x08 - 0x0F */
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+ 0, 0, 0, 0,
+ /* 0x10 - 0x17 */
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+ 0, 0, 0, 0,
+ /* 0x18 - 0x1F */
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+ 0, 0, 0, 0,
+ /* 0x20 - 0x27 */
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+ 0, 0, 0, 0,
+ /* 0x28 - 0x2F */
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+ 0, 0, 0, 0,
+ /* 0x30 - 0x37 */
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+ 0, 0, 0, 0,
+ /* 0x38 - 0x3F */
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+ 0, 0, 0, 0,
+ /* 0x40 - 0x4F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x50 - 0x5F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x60 - 0x6F */
+ 0, 0, 0, DstReg|SrcMem32|ModRM|Mov /* movsxd (x86/64) */,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x70 - 0x7F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x80 - 0x87 */
+ ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImm|ModRM,
+ ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM,
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ /* 0x88 - 0x8F */
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+ ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+ 0, 0, 0, DstMem|SrcNone|ModRM|Mov,
+ /* 0x90 - 0x9F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xA0 - 0xA7 */
+ ByteOp|DstReg|SrcMem|Mov, DstReg|SrcMem|Mov,
+ ByteOp|DstMem|SrcReg|Mov, DstMem|SrcReg|Mov,
+ ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
+ ByteOp|ImplicitOps, ImplicitOps,
+ /* 0xA8 - 0xAF */
+ 0, 0, ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
+ ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
+ ByteOp|ImplicitOps, ImplicitOps,
+ /* 0xB0 - 0xBF */
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xC0 - 0xC7 */
+ ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM, 0, 0,
+ 0, 0, ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImm|ModRM,
+ /* 0xC8 - 0xCF */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xD0 - 0xD7 */
+ ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
+ ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
+ 0, 0, 0, 0,
+ /* 0xD8 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 - 0xEF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xF0 - 0xF7 */
+ 0, 0, 0, 0,
+ 0, 0, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM,
+ /* 0xF8 - 0xFF */
+ 0, 0, 0, 0,
+ 0, 0, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM
+};
+
+static uint8_t twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0,
+ /* 0x10 - 0x1F */
+ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x20 - 0x2F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x30 - 0x3F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x40 - 0x47 */
+ DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+ DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+ DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+ DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+ /* 0x48 - 0x4F */
+ DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+ DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+ DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+ DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
+ /* 0x50 - 0x5F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x60 - 0x6F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x70 - 0x7F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x80 - 0x8F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0x90 - 0x9F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xA0 - 0xA7 */
+ 0, 0, 0, DstMem|SrcReg|ModRM, 0, 0, 0, 0,
+ /* 0xA8 - 0xAF */
+ 0, 0, 0, DstMem|SrcReg|ModRM, 0, 0, 0, 0,
+ /* 0xB0 - 0xB7 */
+ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, DstMem|SrcReg|ModRM,
+ 0, 0, ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
+ /* 0xB8 - 0xBF */
+ 0, 0, DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM,
+ 0, 0, ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
+ /* 0xC0 - 0xCF */
+ 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xD0 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xE0 - 0xEF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xF0 - 0xFF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * insn_fetch - fetch the next 1 to 4 bytes from instruction stream
+ *
+ * @_type: uint8_t, uint16_t, uint32_t, int8_t, int16_t, or int32_t
+ * @_size: 1, 2, or 4 bytes
+ * @_eip: address to fetch from guest memory
+ * @_length: instruction length counter, incremented by _size
+ *
+ * This is used internally by hvm_instruction_length to fetch the next byte,
+ * word, or dword from guest memory at location _eip. A local unsigned long
+ * serves as the storage buffer, since at most four bytes are fetched at a
+ * time.
+ */
+#define insn_fetch(_type, _size, _eip, _length) \
+({ unsigned long _x; \
+ if ((rc = inst_copy_from_guest((unsigned char *)(&(_x)), \
+ (unsigned long)(_eip), _size)) \
+ != _size) \
+ goto done; \
+ (_eip) += (_size); \
+ (_length) += (_size); \
+ (_type)_x; \
+})
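+
+/*
+ * Usage sketch (illustrative note, not part of the original source):
+ * fetching an 8-bit sign-extended immediate advances both the shadow eip
+ * and the length counter by one byte:
+ *
+ *     insn_fetch(int8_t, 1, _regs.eip, length);
+ *
+ * If the guest read fails, rc != _size and control jumps to the 'done'
+ * label of the enclosing function.
+ */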
+
+/**
+ * hvm_instruction_length - returns the length of the current instruction
+ *
+ * @regs: guest register state
+ * @mode: guest operating mode
+ *
+ * EXTERNAL: this routine calculates the length of the instruction currently
+ * pointed to by eip. The guest state is _not_ changed by this routine.
+ */
+int hvm_instruction_length(struct cpu_user_regs *regs, int mode)
+{
+ uint8_t b, d, twobyte = 0, rex_prefix = 0;
+ uint8_t modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
+ unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
+ int rc = 0;
+ int length = 0;
+ unsigned int tmp;
+
+ /* Shadow copy of register state. Committed on successful emulation. */
+ struct cpu_user_regs _regs = *regs;
+
+ /* include CS for 16-bit modes */
+ if (mode == X86EMUL_MODE_REAL || mode == X86EMUL_MODE_PROT16)
+ _regs.eip += (_regs.cs << 4);
+
+ switch ( mode )
+ {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_PROT16:
+ op_bytes = ad_bytes = 2;
+ break;
+ case X86EMUL_MODE_PROT32:
+ op_bytes = ad_bytes = 4;
+ break;
+#ifdef __x86_64__
+ case X86EMUL_MODE_PROT64:
+ op_bytes = 4;
+ ad_bytes = 8;
+ break;
+#endif
+ default:
+ return -1;
+ }
+
+ /* Legacy prefixes. */
+ for ( i = 0; i < 8; i++ )
+ {
+ switch ( b = insn_fetch(uint8_t, 1, _regs.eip, length) )
+ {
+ case 0x66: /* operand-size override */
+ op_bytes ^= 6; /* switch between 2/4 bytes */
+ break;
+ case 0x67: /* address-size override */
+ if ( mode == X86EMUL_MODE_PROT64 )
+ ad_bytes ^= 12; /* switch between 4/8 bytes */
+ else
+ ad_bytes ^= 6; /* switch between 2/4 bytes */
+ break;
+ case 0x2e: /* CS override */
+ case 0x3e: /* DS override */
+ case 0x26: /* ES override */
+ case 0x64: /* FS override */
+ case 0x65: /* GS override */
+ case 0x36: /* SS override */
+ break;
+ case 0xf0: /* LOCK */
+ lock_prefix = 1;
+ break;
+ case 0xf3: /* REP/REPE/REPZ */
+ rep_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ break;
+ default:
+ goto done_prefixes;
+ }
+ }
+done_prefixes:
+
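+ /*
+ * Aside (illustrative note, not part of the original source): the XOR
+ * updates above work because 2 ^ 6 == 4 and 4 ^ 6 == 2, so each 0x66
+ * prefix toggles op_bytes between the two legal widths; similarly
+ * 4 ^ 12 == 8 toggles ad_bytes between 4 and 8 in long mode.
+ */
+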
+ /* Not quite the same as 80386 real mode, but hopefully good enough. */
+ if ( (mode == X86EMUL_MODE_REAL) && (ad_bytes != 2) ) {
+ printf("sonofabitch!! we don't support 32-bit addresses in
realmode\n");
+ goto cannot_emulate;
+ }
+
+ /* REX prefix. */
+ if ( (mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40) )
+ {
+ rex_prefix = b;
+ if ( b & 8 )
+ op_bytes = 8; /* REX.W */
+ modrm_reg = (b & 4) << 1; /* REX.R */
+ /* REX.B and REX.X do not need to be decoded. */
+ b = insn_fetch(uint8_t, 1, _regs.eip, length);
+ }
+
+ /* Opcode byte(s). */
+ d = opcode_table[b];
+ if ( d == 0 )
+ {
+ /* Two-byte opcode? */
+ if ( b == 0x0f )
+ {
+ twobyte = 1;
+ b = insn_fetch(uint8_t, 1, _regs.eip, length);
+ d = twobyte_table[b];
+ }
+
+ /* Unrecognised? */
+ if ( d == 0 )
+ goto cannot_emulate;
+ }
+
+ /* ModRM and SIB bytes. */
+ if ( d & ModRM )
+ {
+ modrm = insn_fetch(uint8_t, 1, _regs.eip, length);
+ modrm_mod |= (modrm & 0xc0) >> 6;
+ modrm_reg |= (modrm & 0x38) >> 3;
+ modrm_rm |= (modrm & 0x07);
+
+ if ( modrm_mod == 3 )
+ {
+ DPRINTK("Cannot parse ModRM.mod == 3.\n");
+ goto cannot_emulate;
+ }
+
+ if ( ad_bytes == 2 )
+ {
+ /* 16-bit ModR/M decode. */
+ switch ( modrm_mod )
+ {
+ case 0:
+ if ( modrm_rm == 6 )
+ {
+ length += 2;
+ _regs.eip += 2; /* skip disp16 */
+ }
+ break;
+ case 1:
+ length += 1;
+ _regs.eip += 1; /* skip disp8 */
+ break;
+ case 2:
+ length += 2;
+ _regs.eip += 2; /* skip disp16 */
+ break;
+ }
+ }
+ else
+ {
+ /* 32/64-bit ModR/M decode. */
+ switch ( modrm_mod )
+ {
+ case 0:
+ if ( (modrm_rm == 4) &&
+ (((insn_fetch(uint8_t, 1, _regs.eip, length)) & 7)
+ == 5) )
+ {
+ length += 4;
+ _regs.eip += 4; /* skip disp32 specified by SIB.base */
+ }
+ else if ( modrm_rm == 5 )
+ {
+ length += 4;
+ _regs.eip += 4; /* skip disp32 */
+ }
+ break;
+ case 1:
+ if ( modrm_rm == 4 )
+ {
+ insn_fetch(uint8_t, 1, _regs.eip, length);
+ }
+ length += 1;
+ _regs.eip += 1; /* skip disp8 */
+ break;
+ case 2:
+ if ( modrm_rm == 4 )
+ {
+ insn_fetch(uint8_t, 1, _regs.eip, length);
+ }
+ length += 4;
+ _regs.eip += 4; /* skip disp32 */
+ break;
+ }
+ }
+ }
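+
+ /*
+ * Worked example (assumed encoding, for illustration): in 32-bit
+ * protected mode, "mov %eax,0x12345678" can be encoded as
+ * 89 05 78 56 34 12; the ModRM byte gives modrm_mod == 0 and
+ * modrm_rm == 5, so the four displacement bytes are skipped above and
+ * length reaches the full instruction size of 6.
+ */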
+
+ /* Decode and fetch the destination operand: register or memory. */
+ switch ( d & DstMask )
+ {
+ case ImplicitOps:
+ /* Special instructions do their own operand decoding. */
+ goto done;
+ }
+
+ /* Decode and fetch the source operand: register, memory or immediate. */
+ switch ( d & SrcMask )
+ {
+ case SrcImm:
+ tmp = (d & ByteOp) ? 1 : op_bytes;
+ if ( tmp == 8 ) tmp = 4;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch ( tmp )
+ {
+ case 1: insn_fetch(int8_t, 1, _regs.eip, length); break;
+ case 2: insn_fetch(int16_t, 2, _regs.eip, length); break;
+ case 4: insn_fetch(int32_t, 4, _regs.eip, length); break;
+ }
+ break;
+ case SrcImmByte:
+ insn_fetch(int8_t, 1, _regs.eip, length);
+ break;
+ }
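+
+ /*
+ * Worked example (assumed encoding, for illustration):
+ * "addl $0x11223344,(%ebx)" can be encoded as 81 03 44 33 22 11.
+ * Opcode 0x81 decodes as DstMem|SrcImm|ModRM, so the SrcImm case above
+ * consumes the four immediate bytes, for a total length of 6.
+ */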
+
+ if ( twobyte )
+ goto done;
+
+ switch ( b )
+ {
+ case 0xa0 ... 0xa1: /* mov */
+ length += ad_bytes;
+ _regs.eip += ad_bytes; /* skip src displacement */
+ break;
+ case 0xa2 ... 0xa3: /* mov */
+ length += ad_bytes;
+ _regs.eip += ad_bytes; /* skip dst displacement */
+ break;
+ case 0xf6 ... 0xf7: /* Grp3 */
+ switch ( modrm_reg )
+ {
+ case 0 ... 1: /* test */
+ /* Special case in Grp3: test has an immediate source operand. */
+ tmp = (d & ByteOp) ? 1 : op_bytes;
+ if ( tmp == 8 ) tmp = 4;
+ switch ( tmp )
+ {
+ case 1: insn_fetch(int8_t, 1, _regs.eip, length); break;
+ case 2: insn_fetch(int16_t, 2, _regs.eip, length); break;
+ case 4: insn_fetch(int32_t, 4, _regs.eip, length); break;
+ }
+ goto done;
+ }
+ break;
+ }
+
+done:
+ return length;
+
+cannot_emulate:
+ DPRINTK("Cannot emulate %02x at address %lx (eip %lx, mode %d)\n",
+ b, (unsigned long)_regs.eip, (unsigned long)regs->eip, mode);
+ return -1;
+}
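
For context, a minimal sketch of how a caller might use the relocated helper
follows. The wrapper name mmio_fetch_len and its error handling are
assumptions for illustration; only hvm_instruction_length itself comes from
this changeset.

    #include <asm/regs.h>   /* struct cpu_user_regs */

    extern int hvm_instruction_length(struct cpu_user_regs *regs, int mode);

    /* Hypothetical caller sketch -- not part of this changeset. */
    static int mmio_fetch_len(struct cpu_user_regs *regs, int mode)
    {
        int len = hvm_instruction_length(regs, mode);

        if (len < 0)
            return -1;  /* undecodable instruction: caller must bail out */

        /* 'len' bytes starting at the guest eip must now be fetched. */
        return len;
    }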
diff -r 9da2d9b48ff8 -r f34e37d0742d xen/arch/x86/hvm/svm/instrlen.c
--- a/xen/arch/x86/hvm/svm/instrlen.c Tue Sep 26 16:15:45 2006 -0600
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,479 +0,0 @@
-/*
- * instrlen.c - calculates the instruction length for all operating modes
- *
- * Travis Betak, travis.betak@xxxxxxx
- * Copyright (c) 2005,2006 AMD
- * Copyright (c) 2005 Keir Fraser
- *
- * Essentially a very, very stripped version of Keir Fraser's work in
- * x86_emulate.c. Used for MMIO.
- */
-
-/*
- * TODO: the way in which we use svm_instrlen is very inefficient as is now
- * stands. It will be worth while to return the actual instruction buffer
- * along with the instruction length since one of the reasons we are getting
- * the instruction length is to know how many instruction bytes we need to
- * fetch.
- */
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/lib.h>
-#include <xen/mm.h>
-#include <asm/regs.h>
-#define DPRINTF DPRINTK
-#include <asm-x86/x86_emulate.h>
-
-/* read from guest memory */
-extern int inst_copy_from_guest(unsigned char *buf, unsigned long eip,
- int length);
-extern void svm_dump_inst(unsigned long eip);
-
-/*
- * Opcode effective-address decode tables.
- * Note that we only emulate instructions that have at least one memory
- * operand (excluding implicit stack references). We assume that stack
- * references and instruction fetches will never occur in special memory
- * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
- * not be handled.
- */
-
-/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp (1<<0) /* 8-bit operands. */
-/* Destination operand type. */
-#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
-#define DstReg (2<<1) /* Register operand. */
-#define DstMem (3<<1) /* Memory operand. */
-#define DstMask (3<<1)
-/* Source operand type. */
-#define SrcNone (0<<3) /* No source operand. */
-#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
-#define SrcReg (1<<3) /* Register operand. */
-#define SrcMem (2<<3) /* Memory operand. */
-#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
-#define SrcImm (5<<3) /* Immediate operand. */
-#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
-#define SrcMask (7<<3)
-/* Generic ModRM decode. */
-#define ModRM (1<<6)
-/* Destination is only written; never read. */
-#define Mov (1<<7)
-
-static uint8_t opcode_table[256] = {
- /* 0x00 - 0x07 */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
- 0, 0, 0, 0,
- /* 0x08 - 0x0F */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
- 0, 0, 0, 0,
- /* 0x10 - 0x17 */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
- 0, 0, 0, 0,
- /* 0x18 - 0x1F */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
- 0, 0, 0, 0,
- /* 0x20 - 0x27 */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
- 0, 0, 0, 0,
- /* 0x28 - 0x2F */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
- 0, 0, 0, 0,
- /* 0x30 - 0x37 */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
- 0, 0, 0, 0,
- /* 0x38 - 0x3F */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
- 0, 0, 0, 0,
- /* 0x40 - 0x4F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x50 - 0x5F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x60 - 0x6F */
- 0, 0, 0, DstReg|SrcMem32|ModRM|Mov /* movsxd (x86/64) */,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x70 - 0x7F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x80 - 0x87 */
- ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImm|ModRM,
- ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM,
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- /* 0x88 - 0x8F */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
- 0, 0, 0, DstMem|SrcNone|ModRM|Mov,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xA0 - 0xA7 */
- ByteOp|DstReg|SrcMem|Mov, DstReg|SrcMem|Mov,
- ByteOp|DstMem|SrcReg|Mov, DstMem|SrcReg|Mov,
- ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
- ByteOp|ImplicitOps, ImplicitOps,
- /* 0xA8 - 0xAF */
- 0, 0, ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
- ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
- ByteOp|ImplicitOps, ImplicitOps,
- /* 0xB0 - 0xBF */
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xC0 - 0xC7 */
- ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM, 0, 0,
- 0, 0, ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImm|ModRM,
- /* 0xC8 - 0xCF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xD7 */
- ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
- ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
- 0, 0, 0, 0,
- /* 0xD8 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xEF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xF0 - 0xF7 */
- 0, 0, 0, 0,
- 0, 0, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM,
- /* 0xF8 - 0xFF */
- 0, 0, 0, 0,
- 0, 0, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM
-};
-
-static uint8_t twobyte_table[256] = {
- /* 0x00 - 0x0F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0,
- /* 0x10 - 0x1F */
- 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0,
- /* 0x20 - 0x2F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x30 - 0x3F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x40 - 0x47 */
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- /* 0x48 - 0x4F */
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- /* 0x50 - 0x5F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x60 - 0x6F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x70 - 0x7F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x80 - 0x8F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xA0 - 0xA7 */
- 0, 0, 0, DstMem|SrcReg|ModRM, 0, 0, 0, 0,
- /* 0xA8 - 0xAF */
- 0, 0, 0, DstMem|SrcReg|ModRM, 0, 0, 0, 0,
- /* 0xB0 - 0xB7 */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, DstMem|SrcReg|ModRM,
- 0, 0, ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
- /* 0xB8 - 0xBF */
- 0, 0, DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM,
- 0, 0, ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
- /* 0xC0 - 0xCF */
- 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xEF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xF0 - 0xFF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/*
- * insn_fetch - fetch the next 1 to 4 bytes from instruction stream
- *
- * @_type: u8, u16, u32, s8, s16, or s32
- * @_size: 1, 2, or 4 bytes
- * @_eip: address to fetch from guest memory
- * @_length: updated! increments the current instruction length counter by _size
- *
- * INTERNAL this is used internally by svm_instrlen to fetch the next byte,
- * word, or dword from guest memory at location _eip. we currently use a local
- * unsigned long as the storage buffer since the most bytes we're gonna get
- * is limited to 4.
- */
-#define insn_fetch(_type, _size, _eip, _length) \
-({ unsigned long _x; \
- if ((rc = inst_copy_from_guest((unsigned char *)(&(_x)), \
- (unsigned long)(_eip), _size)) \
- != _size) \
- goto done; \
- (_eip) += (_size); \
- (_length) += (_size); \
- (_type)_x; \
-})
-
-
-/**
- * svn_instrlen - returns the current instructions length
- *
- * @regs: guest register state
- * @mode: guest operating mode
- *
- * EXTERNAL this routine calculates the length of the current instruction
- * pointed to by eip. The guest state is _not_ changed by this routine.
- */
-int svm_instrlen(struct cpu_user_regs *regs, int mode)
-{
- uint8_t b, d, twobyte = 0, rex_prefix = 0;
- uint8_t modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
- unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
- int rc = 0;
- int length = 0;
- unsigned int tmp;
-
- /* Shadow copy of register state. Committed on successful emulation. */
- struct cpu_user_regs _regs = *regs;
-
- /* include CS for 16-bit modes */
- if (mode == X86EMUL_MODE_REAL || mode == X86EMUL_MODE_PROT16)
- _regs.eip += (_regs.cs << 4);
-
- switch ( mode )
- {
- case X86EMUL_MODE_REAL:
- case X86EMUL_MODE_PROT16:
- op_bytes = ad_bytes = 2;
- break;
- case X86EMUL_MODE_PROT32:
- op_bytes = ad_bytes = 4;
- break;
-#ifdef __x86_64__
- case X86EMUL_MODE_PROT64:
- op_bytes = 4;
- ad_bytes = 8;
- break;
-#endif
- default:
- return -1;
- }
-
- /* Legacy prefixes. */
- for ( i = 0; i < 8; i++ )
- {
- switch ( b = insn_fetch(uint8_t, 1, _regs.eip, length) )
- {
- case 0x66: /* operand-size override */
- op_bytes ^= 6; /* switch between 2/4 bytes */
- break;
- case 0x67: /* address-size override */
- if ( mode == X86EMUL_MODE_PROT64 )
- ad_bytes ^= 12; /* switch between 4/8 bytes */
- else
- ad_bytes ^= 6; /* switch between 2/4 bytes */
- break;
- case 0x2e: /* CS override */
- case 0x3e: /* DS override */
- case 0x26: /* ES override */
- case 0x64: /* FS override */
- case 0x65: /* GS override */
- case 0x36: /* SS override */
- break;
- case 0xf0: /* LOCK */
- lock_prefix = 1;
- break;
- case 0xf3: /* REP/REPE/REPZ */
- rep_prefix = 1;
- break;
- case 0xf2: /* REPNE/REPNZ */
- break;
- default:
- goto done_prefixes;
- }
- }
-done_prefixes:
-
- /* Note quite the same as 80386 real mode, but hopefully good enough. */
- if ( (mode == X86EMUL_MODE_REAL) && (ad_bytes != 2) ) {
- printf("sonofabitch!! we don't support 32-bit addresses in
realmode\n");
- goto cannot_emulate;
- }
-
- /* REX prefix. */
- if ( (mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40) )
- {
- rex_prefix = b;
- if ( b & 8 )
- op_bytes = 8; /* REX.W */
- modrm_reg = (b & 4) << 1; /* REX.R */
- /* REX.B and REX.X do not need to be decoded. */
- b = insn_fetch(uint8_t, 1, _regs.eip, length);
- }
-
- /* Opcode byte(s). */
- d = opcode_table[b];
- if ( d == 0 )
- {
- /* Two-byte opcode? */
- if ( b == 0x0f )
- {
- twobyte = 1;
- b = insn_fetch(uint8_t, 1, _regs.eip, length);
- d = twobyte_table[b];
- }
-
- /* Unrecognised? */
- if ( d == 0 )
- goto cannot_emulate;
- }
-
- /* ModRM and SIB bytes. */
- if ( d & ModRM )
- {
- modrm = insn_fetch(uint8_t, 1, _regs.eip, length);
- modrm_mod |= (modrm & 0xc0) >> 6;
- modrm_reg |= (modrm & 0x38) >> 3;
- modrm_rm |= (modrm & 0x07);
-
- if ( modrm_mod == 3 )
- {
- DPRINTF("Cannot parse ModRM.mod == 3.\n");
- goto cannot_emulate;
- }
-
- if ( ad_bytes == 2 )
- {
- /* 16-bit ModR/M decode. */
- switch ( modrm_mod )
- {
- case 0:
- if ( modrm_rm == 6 )
- {
- length += 2;
- _regs.eip += 2; /* skip disp16 */
- }
- break;
- case 1:
- length += 1;
- _regs.eip += 1; /* skip disp8 */
- break;
- case 2:
- length += 2;
- _regs.eip += 2; /* skip disp16 */
- break;
- }
- }
- else
- {
- /* 32/64-bit ModR/M decode. */
- switch ( modrm_mod )
- {
- case 0:
- if ( (modrm_rm == 4) &&
- (((insn_fetch(uint8_t, 1, _regs.eip, length)) & 7)
- == 5) )
- {
- length += 4;
- _regs.eip += 4; /* skip disp32 specified by SIB.base */
- }
- else if ( modrm_rm == 5 )
- {
- length += 4;
- _regs.eip += 4; /* skip disp32 */
- }
- break;
- case 1:
- if ( modrm_rm == 4 )
- {
- insn_fetch(uint8_t, 1, _regs.eip, length);
- }
- length += 1;
- _regs.eip += 1; /* skip disp8 */
- break;
- case 2:
- if ( modrm_rm == 4 )
- {
- insn_fetch(uint8_t, 1, _regs.eip, length);
- }
- length += 4;
- _regs.eip += 4; /* skip disp32 */
- break;
- }
- }
- }
-
- /* Decode and fetch the destination operand: register or memory. */
- switch ( d & DstMask )
- {
- case ImplicitOps:
- /* Special instructions do their own operand decoding. */
- goto done;
- }
-
- /* Decode and fetch the source operand: register, memory or immediate. */
- switch ( d & SrcMask )
- {
- case SrcImm:
- tmp = (d & ByteOp) ? 1 : op_bytes;
- if ( tmp == 8 ) tmp = 4;
- /* NB. Immediates are sign-extended as necessary. */
- switch ( tmp )
- {
- case 1: insn_fetch(int8_t, 1, _regs.eip, length); break;
- case 2: insn_fetch(int16_t, 2, _regs.eip, length); break;
- case 4: insn_fetch(int32_t, 4, _regs.eip, length); break;
- }
- break;
- case SrcImmByte:
- insn_fetch(int8_t, 1, _regs.eip, length);
- break;
- }
-
- if ( twobyte )
- goto done;
-
- switch ( b )
- {
- case 0xa0 ... 0xa1: /* mov */
- length += ad_bytes;
- _regs.eip += ad_bytes; /* skip src displacement */
- break;
- case 0xa2 ... 0xa3: /* mov */
- length += ad_bytes;
- _regs.eip += ad_bytes; /* skip dst displacement */
- break;
- case 0xf6 ... 0xf7: /* Grp3 */
- switch ( modrm_reg )
- {
- case 0 ... 1: /* test */
- /* Special case in Grp3: test has an immediate source operand. */
- tmp = (d & ByteOp) ? 1 : op_bytes;
- if ( tmp == 8 ) tmp = 4;
- switch ( tmp )
- {
- case 1: insn_fetch(int8_t, 1, _regs.eip, length); break;
- case 2: insn_fetch(int16_t, 2, _regs.eip, length); break;
- case 4: insn_fetch(int32_t, 4, _regs.eip, length); break;
- }
- goto done;
- }
- break;
- }
-
-done:
- return length;
-
-cannot_emulate:
- DPRINTF("Cannot emulate %02x at address %lx (eip %lx, mode %d)\n",
- b, (unsigned long)_regs.eip, (unsigned long)regs->eip, mode);
- svm_dump_inst(_regs.eip);
- return -1;
-}
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog