# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1259157928 0
# Node ID d53db6af369f6ba8f89aa038999eccaf10c0416e
# Parent 12644e457c3747f6174fe97e25ce6764c2cf0fd1
Replace tsc_native config option with tsc_mode config option
(NOTE: pvrdtscp mode not finished yet, but all other
modes have been tested so sooner seemed better than
later to submit this fairly major patch so we can get
more mileage on it before next release.)
New tsc_mode config option supersedes tsc_native and
offers a more intelligent default and an additional
option for intelligent apps running on PV domains
("pvrdtscp").
For PV domains, default mode will determine if the initial
host has a "safe"** TSC (meaning it is always synchronized
across all physical CPUs). If so, all domains will
execute all rdtsc instructions natively; if not,
all domains will emulate all rdtsc instructions while
providing the TSC hertz rate of the initial machine.
After being restored or live-migrated, all PV domains will
emulate all rdtsc instructions. Hence, this default mode
guarantees correctness while providing native performance
in most conditions.
For PV domains, tsc_mode==1 will always emulate rdtsc
and tsc_mode==2 will never emulate rdtsc. For tsc_mode==3,
rdtsc will never be emulated, but information is provided
through pvcpuid instructions and rdtscp instructions
so that an app can obtain "safe" pvclock-like TSC information
across save/restore and live migration. (Will be completed in
a follow-on patch.)
For HVM domains, the default mode and "always emulate"
mode do the same as tsc_native==0; the other two modes
do the same as tsc_native==1. (HVM domains since 3.4
have implemented a tsc_mode=default-like functionality,
but also can preserve native TSC across save/restore
and live-migration IFF the initial and target machines
have a common TSC cycle rate.)
** All newer AMD machines, and Nehalem and future Intel
machines have "Invariant TSC"; many newer Intel machines
have "Constant TSC" and do not support deep-C sleep states;
these and all single-processor machines are "safe".
Signed-off-by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
---
tools/examples/xmexample.hvm | 11 -
tools/libxc/xc_domain.c | 55 +++++-
tools/libxc/xc_domain_restore.c | 12 +
tools/libxc/xc_domain_save.c | 31 +++
tools/libxc/xenctrl.h | 14 +
tools/python/xen/lowlevel/xc/xc.c | 27 +--
tools/python/xen/xend/XendConfig.py | 6
tools/python/xen/xend/XendDomainInfo.py | 6
tools/python/xen/xm/create.py | 10 -
tools/python/xen/xm/xenapi_create.py | 2
xen/arch/x86/domain.c | 2
xen/arch/x86/domctl.c | 38 +++-
xen/arch/x86/hvm/hvm.c | 2
xen/arch/x86/time.c | 283 ++++++++++++++++++++++++++++----
xen/arch/x86/traps.c | 15 +
xen/include/asm-x86/domain.h | 18 +-
xen/include/asm-x86/msr.h | 2
xen/include/asm-x86/processor.h | 4
xen/include/asm-x86/time.h | 28 +++
xen/include/public/domctl.h | 23 +-
20 files changed, 498 insertions(+), 91 deletions(-)
diff -r 12644e457c37 -r d53db6af369f tools/examples/xmexample.hvm
--- a/tools/examples/xmexample.hvm Wed Nov 25 14:04:46 2009 +0000
+++ b/tools/examples/xmexample.hvm Wed Nov 25 14:05:28 2009 +0000
@@ -178,11 +178,16 @@ serial='pty'
serial='pty'
#----------------------------------------------------------------------------
-# tsc_native : TSC mode (0=emulate TSC, 1=native TSC)
+# tsc_mode : TSC mode (0=default, 1=always emulate, 2=never emulate, 3=pvrdtscp)
# emulate TSC provides synced TSC for all vcpus, but lose perfomrance.
# native TSC leverages hardware's TSC(no perf loss), but vcpu's TSC may lose
-# sync due to hardware's unreliable/unsynced TSC between CPUs.
-tsc_native=1
+# sync due to hardware's unreliable/unsynced TSC between CPUs.
+# default intelligently uses native TSC on machines where it is safe, but
+# switches to emulated if necessary after save/restore/migration
+# pvrdtscp is for intelligent apps that use special Xen-only paravirtualized
+# cpuid instructions to obtain offset/scaling/migration info and maximize
+# performance within pools of machines that support the rdtscp instruction
+tsc_mode=0
#-----------------------------------------------------------------------------
# Qemu Monitor, default is disable
diff -r 12644e457c37 -r d53db6af369f tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c Wed Nov 25 14:04:46 2009 +0000
+++ b/tools/libxc/xc_domain.c Wed Nov 25 14:05:28 2009 +0000
@@ -466,15 +466,6 @@ int xc_domain_set_time_offset(int xc_han
return do_domctl(xc_handle, &domctl);
}
-int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native)
-{
- DECLARE_DOMCTL;
- domctl.cmd = XEN_DOMCTL_set_tsc_native;
- domctl.domain = (domid_t)domid;
- domctl.u.set_tsc_native.is_native = is_native;
- return do_domctl(xc_handle, &domctl);
-}
-
int xc_domain_disable_migrate(int xc_handle, uint32_t domid)
{
DECLARE_DOMCTL;
@@ -483,6 +474,52 @@ int xc_domain_disable_migrate(int xc_han
domctl.u.disable_migrate.disable = 1;
return do_domctl(xc_handle, &domctl);
}
+
+/*
+ * Set the TSC configuration of domain 'domid' by issuing the
+ * XEN_DOMCTL_settscinfo domctl. The four values are passed to the
+ * hypervisor unchanged in domctl.u.tsc_info.info.
+ * Returns whatever do_domctl() returns.
+ */
+int xc_domain_set_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t tsc_mode,
+ uint64_t elapsed_nsec,
+ uint32_t gtsc_khz,
+ uint32_t incarnation)
+{
+ DECLARE_DOMCTL;
+ domctl.cmd = XEN_DOMCTL_settscinfo;
+ domctl.domain = (domid_t)domid;
+ domctl.u.tsc_info.info.tsc_mode = tsc_mode;
+ domctl.u.tsc_info.info.elapsed_nsec = elapsed_nsec;
+ domctl.u.tsc_info.info.gtsc_khz = gtsc_khz;
+ domctl.u.tsc_info.info.incarnation = incarnation;
+ return do_domctl(xc_handle, &domctl);
+}
+
+/*
+ * Fetch the TSC configuration of domain 'domid' via the
+ * XEN_DOMCTL_gettscinfo domctl.
+ * The hypervisor writes its answer into the local 'info' buffer through
+ * the out_info guest handle; the buffer is locked with lock_pages() for
+ * the duration of the call and unlocked afterwards.
+ * The four output pointers are written only when the domctl returns 0.
+ * Returns the lock_pages() error if locking fails, otherwise the
+ * do_domctl() result.
+ */
+int xc_domain_get_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t *tsc_mode,
+ uint64_t *elapsed_nsec,
+ uint32_t *gtsc_khz,
+ uint32_t *incarnation)
+{
+ int rc;
+ DECLARE_DOMCTL;
+ xen_guest_tsc_info_t info = { 0 };
+
+ domctl.cmd = XEN_DOMCTL_gettscinfo;
+ domctl.domain = (domid_t)domid;
+ set_xen_guest_handle(domctl.u.tsc_info.out_info, &info);
+ if ( (rc = lock_pages(&info, sizeof(info))) != 0 )
+ return rc;
+ rc = do_domctl(xc_handle, &domctl);
+ if ( rc == 0 )
+ {
+ *tsc_mode = info.tsc_mode;
+ *elapsed_nsec = info.elapsed_nsec;
+ *gtsc_khz = info.gtsc_khz;
+ *incarnation = info.incarnation;
+ }
+ unlock_pages(&info,sizeof(info));
+ return rc;
+}
+
int xc_domain_memory_increase_reservation(int xc_handle,
uint32_t domid,
diff -r 12644e457c37 -r d53db6af369f tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c Wed Nov 25 14:04:46 2009 +0000
+++ b/tools/libxc/xc_domain_restore.c Wed Nov 25 14:05:28 2009 +0000
@@ -1084,6 +1084,18 @@ static int pagebuf_get_one(pagebuf_t* bu
return -1;
}
return pagebuf_get_one(buf, fd, xch, dom);
+ } else if ( count == -7 ) {
+ uint32_t tsc_mode, khz, incarn;
+ uint64_t nsec;
+ if ( read_exact(fd, &tsc_mode, sizeof(uint32_t)) ||
+ read_exact(fd, &nsec, sizeof(uint64_t)) ||
+ read_exact(fd, &khz, sizeof(uint32_t)) ||
+ read_exact(fd, &incarn, sizeof(uint32_t)) ||
+ xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) {
+ ERROR("error reading/restoring tsc info");
+ return -1;
+ }
+ return pagebuf_get_one(buf, fd, xch, dom);
} else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
ERROR("Max batch size exceeded (%d). Giving up.", count);
return -1;
diff -r 12644e457c37 -r d53db6af369f tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c Wed Nov 25 14:04:46 2009 +0000
+++ b/tools/libxc/xc_domain_save.c Wed Nov 25 14:05:28 2009 +0000
@@ -841,6 +841,24 @@ static xen_pfn_t *map_and_save_p2m_table
return success ? p2m : NULL;
}
+/*
+ * Write the domain's TSC info record to the save stream.
+ * must be done AFTER suspend_and_state()
+ *
+ * Record layout, in write order: int marker (-7), uint32_t tsc_mode,
+ * uint64_t nsec, uint32_t khz, uint32_t incarn. The restore side
+ * (pagebuf_get_one() in xc_domain_restore.c) recognises the record by a
+ * count of -7 and reads the four values in the same order.
+ * Returns 0 on success, -1 if fetching the info or any write fails.
+ */
+static int save_tsc_info(int xc_handle, uint32_t dom, int io_fd)
+{
+ int marker = -7;
+ uint32_t tsc_mode, khz, incarn;
+ uint64_t nsec;
+
+ if ( xc_domain_get_tsc_info(xc_handle, dom, &tsc_mode,
+ &nsec, &khz, &incarn) < 0 ||
+ write_exact(io_fd, &marker, sizeof(marker)) ||
+ write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) ||
+ write_exact(io_fd, &nsec, sizeof(nsec)) ||
+ write_exact(io_fd, &khz, sizeof(khz)) ||
+ write_exact(io_fd, &incarn, sizeof(incarn)) )
+ return -1;
+ return 0;
+}
+
int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
uint32_t max_factor, uint32_t flags,
struct save_callbacks* callbacks,
@@ -1097,6 +1115,12 @@ int xc_domain_save(int xc_handle, int io
if ( tmem_saved == -1 )
{
ERROR("Error when writing to state file (tmem)");
+ goto out;
+ }
+
+ if ( !live && save_tsc_info(xc_handle, dom, io_fd) < 0 )
+ {
+ ERROR("Error when writing to state file (tsc)");
goto out;
}
@@ -1458,6 +1482,13 @@ int xc_domain_save(int xc_handle, int io
goto out;
}
+ if ( save_tsc_info(xc_handle, dom, io_fd) < 0 )
+ {
+ ERROR("Error when writing to state file (tsc)");
+ goto out;
+ }
+
+
}
if ( xc_shadow_control(xc_handle, dom,
diff -r 12644e457c37 -r d53db6af369f tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h Wed Nov 25 14:04:46 2009 +0000
+++ b/tools/libxc/xenctrl.h Wed Nov 25 14:05:28 2009 +0000
@@ -628,7 +628,19 @@ int xc_domain_set_time_offset(int xc_han
uint32_t domid,
int32_t time_offset_seconds);
-int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native);
+int xc_domain_set_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t tsc_mode,
+ uint64_t elapsed_nsec,
+ uint32_t gtsc_khz,
+ uint32_t incarnation);
+
+int xc_domain_get_tsc_info(int xc_handle,
+ uint32_t domid,
+ uint32_t *tsc_mode,
+ uint64_t *elapsed_nsec,
+ uint32_t *gtsc_khz,
+ uint32_t *incarnation);
int xc_domain_disable_migrate(int xc_handle, uint32_t domid);
diff -r 12644e457c37 -r d53db6af369f tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Nov 25 14:04:46 2009 +0000
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Nov 25 14:05:28 2009 +0000
@@ -1486,14 +1486,14 @@ static PyObject *pyxc_domain_set_time_of
return zero;
}
-static PyObject *pyxc_domain_set_tsc_native(XcObject *self, PyObject *args)
-{
- uint32_t dom, is_native;
-
- if (!PyArg_ParseTuple(args, "ii", &dom, &is_native))
- return NULL;
-
- if (xc_domain_set_tsc_native(self->xc_handle, dom, is_native) != 0)
+static PyObject *pyxc_domain_set_tsc_info(XcObject *self, PyObject *args)
+{
+ uint32_t dom, tsc_mode;
+
+ if (!PyArg_ParseTuple(args, "ii", &dom, &tsc_mode))
+ return NULL;
+
+ if (xc_domain_set_tsc_info(self->xc_handle, dom, tsc_mode, 0, 0, 0) != 0)
return pyxc_error_to_exception();
Py_INCREF(zero);
@@ -2036,12 +2036,13 @@ static PyMethodDef pyxc_methods[] = {
" offset [int]: Time offset from UTC in seconds.\n"
"Returns: [int] 0 on success; -1 on error.\n" },
- { "domain_set_tsc_native",
- (PyCFunction)pyxc_domain_set_tsc_native,
- METH_VARARGS, "\n"
- "Set a domain's TSC mode (emulate vs native)\n"
+ { "domain_set_tsc_info",
+ (PyCFunction)pyxc_domain_set_tsc_info,
+ METH_VARARGS, "\n"
+ "Set a domain's TSC mode\n"
" dom [int]: Domain whose TSC mode is being set.\n"
- " is_native [int]: 1=native, 0=emulate.\n"
+ " tsc_mode [int]: 0=default (monotonic, but native where possible)\n"
+ " 1=always emulate 2=never emulate 3=pvrdtscp\n"
"Returns: [int] 0 on success; -1 on error.\n" },
{ "domain_disable_migrate",
diff -r 12644e457c37 -r d53db6af369f tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py Wed Nov 25 14:04:46 2009 +0000
+++ b/tools/python/xen/xend/XendConfig.py Wed Nov 25 14:05:28 2009 +0000
@@ -163,7 +163,7 @@ XENAPI_PLATFORM_CFG_TYPES = {
'vncdisplay': int,
'vnclisten': str,
'timer_mode': int,
- 'tsc_native': int,
+ 'tsc_mode': int,
'vpt_align': int,
'viridian': int,
'vncpasswd': str,
@@ -477,8 +477,8 @@ class XendConfig(dict):
if not os.path.exists(self['platform']['device_model']):
raise VmError("device model '%s' not found" %
str(self['platform']['device_model']))
- if 'tsc_native' not in self['platform']:
- self['platform']['tsc_native'] = 0
+ if 'tsc_mode' not in self['platform']:
+ self['platform']['tsc_mode'] = 0
if 'nomigrate' not in self['platform']:
self['platform']['nomigrate'] = 0
diff -r 12644e457c37 -r d53db6af369f tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py Wed Nov 25 14:04:46 2009 +0000
+++ b/tools/python/xen/xend/XendDomainInfo.py Wed Nov 25 14:05:28 2009 +0000
@@ -2468,9 +2468,9 @@ class XendDomainInfo:
self._recreateDom()
# Set TSC mode of domain
- tsc_native = self.info["platform"].get("tsc_native")
- if arch.type == "x86" and tsc_native is not None:
- xc.domain_set_tsc_native(self.domid, int(tsc_native))
+ tsc_mode = self.info["platform"].get("tsc_mode")
+ if arch.type == "x86" and tsc_mode is not None:
+ xc.domain_set_tsc_info(self.domid, int(tsc_mode))
# Set timer configuration of domain
timer_mode = self.info["platform"].get("timer_mode")
diff -r 12644e457c37 -r d53db6af369f tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py Wed Nov 25 14:04:46 2009 +0000
+++ b/tools/python/xen/xm/create.py Wed Nov 25 14:05:28 2009 +0000
@@ -221,9 +221,9 @@ gopts.var('timer_mode', val='TIMER_MODE'
use="""Timer mode (0=delay virtual time when ticks are missed;
1=virtual time is always wallclock time.""")
-gopts.var('tsc_native', val='TSC_NATIVE',
+gopts.var('tsc_mode', val='TSC_MODE',
fn=set_int, default=0,
- use="""TSC mode (0=emulate TSC, 1=native TSC).""")
+ use="""TSC mode (0=default, 1=always emulate, 2=never emulate,
3=pvrdtscp).""")
gopts.var('nomigrate', val='NOMIGRATE',
fn=set_int, default=0,
@@ -738,8 +738,8 @@ def configure_image(vals):
if vals.suppress_spurious_page_faults:
config_image.append(['suppress_spurious_page_faults',
vals.suppress_spurious_page_faults])
- if vals.tsc_native is not None:
- config_image.append(['tsc_native', vals.tsc_native])
+ if vals.tsc_mode is not None:
+ config_image.append(['tsc_mode', vals.tsc_mode])
if vals.nomigrate is not None:
config_image.append(['nomigrate', vals.nomigrate])
@@ -1036,7 +1036,7 @@ def make_config(vals):
config.append([n, v])
map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
- 'restart', 'on_poweroff', 'tsc_native', 'nomigrate',
+ 'restart', 'on_poweroff', 'tsc_mode', 'nomigrate',
'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail', 'features',
'on_xend_start', 'on_xend_stop', 'target', 'cpuid',
'cpuid_check', 'machine_address_size',
'suppress_spurious_page_faults'])
diff -r 12644e457c37 -r d53db6af369f tools/python/xen/xm/xenapi_create.py
--- a/tools/python/xen/xm/xenapi_create.py Wed Nov 25 14:04:46 2009 +0000
+++ b/tools/python/xen/xm/xenapi_create.py Wed Nov 25 14:05:28 2009 +0000
@@ -1108,7 +1108,7 @@ class sxp2xml:
'pci_msitranslate',
'pci_power_mgmt',
'xen_platform_pci',
- 'tsc_native'
+ 'tsc_mode'
'description',
'nomigrate'
]
diff -r 12644e457c37 -r d53db6af369f xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Wed Nov 25 14:04:46 2009 +0000
+++ b/xen/arch/x86/domain.c Wed Nov 25 14:05:28 2009 +0000
@@ -520,6 +520,8 @@ int arch_domain_create(struct domain *d,
d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
}
+ /* initialize default tsc behavior in case tools don't */
+ tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
spin_lock_init(&d->arch.vtsc_lock);
return 0;
diff -r 12644e457c37 -r d53db6af369f xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c Wed Nov 25 14:04:46 2009 +0000
+++ b/xen/arch/x86/domctl.c Wed Nov 25 14:05:28 2009 +0000
@@ -1101,9 +1101,10 @@ long arch_do_domctl(
}
break;
- case XEN_DOMCTL_set_tsc_native:
- {
- struct domain *d;
+ case XEN_DOMCTL_gettscinfo:
+ {
+ struct domain *d;
+ xen_guest_tsc_info_t info;
ret = -ESRCH;
d = rcu_lock_domain_by_id(domctl->domain);
@@ -1111,9 +1112,34 @@ long arch_do_domctl(
break;
domain_pause(d);
- d->arch.vtsc = !domctl->u.set_tsc_native.is_native;
- if ( is_hvm_domain(d) )
- hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+ tsc_get_info(d, &info.tsc_mode,
+ &info.elapsed_nsec,
+ &info.gtsc_khz,
+ &info.incarnation);
+ if ( copy_to_guest(domctl->u.tsc_info.out_info, &info, 1) )
+ ret = -EFAULT;
+ else
+ ret = 0;
+ domain_unpause(d);
+
+ rcu_unlock_domain(d);
+ }
+ break;
+
+ case XEN_DOMCTL_settscinfo:
+ {
+ struct domain *d;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(domctl->domain);
+ if ( d == NULL )
+ break;
+
+ domain_pause(d);
+ tsc_set_info(d, domctl->u.tsc_info.info.tsc_mode,
+ domctl->u.tsc_info.info.elapsed_nsec,
+ domctl->u.tsc_info.info.gtsc_khz,
+ domctl->u.tsc_info.info.incarnation);
domain_unpause(d);
rcu_unlock_domain(d);
diff -r 12644e457c37 -r d53db6af369f xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Wed Nov 25 14:04:46 2009 +0000
+++ b/xen/arch/x86/hvm/hvm.c Wed Nov 25 14:05:28 2009 +0000
@@ -1831,7 +1831,7 @@ void hvm_cpuid(unsigned int input, unsig
if ( cpuid_viridian_leaves(input, eax, ebx, ecx, edx) )
return;
- if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
+ if ( cpuid_hypervisor_leaves(input, count, eax, ebx, ecx, edx) )
return;
domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx);
diff -r 12644e457c37 -r d53db6af369f xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Wed Nov 25 14:04:46 2009 +0000
+++ b/xen/arch/x86/time.c Wed Nov 25 14:05:28 2009 +0000
@@ -34,6 +34,7 @@
#include <asm/hpet.h>
#include <io_ports.h>
#include <asm/setup.h> /* for early_time_init */
+#include <public/arch-x86/cpuid.h>
/* opt_clocksource: Force clocksource to one of: pit, hpet, cyclone, acpi. */
static char __initdata opt_clocksource[10];
@@ -45,10 +46,12 @@ static u32 wc_sec, wc_nsec; /* UTC time
static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */
static DEFINE_SPINLOCK(wc_lock);
+/* moved to <asm/domain.h>
struct time_scale {
int shift;
u32 mul_frac;
};
+*/
struct cpu_time {
u64 local_tsc_stamp;
@@ -150,13 +153,32 @@ static inline u64 scale_delta(u64 delta,
return product;
}
+/*
+ * The "identity" time_scale: shift 1 with multiplier 0x80000000.
+ * NOTE(review): presumably scale_delta() with these values returns its
+ * input unchanged (shift left by one, then multiply by 0.5) -- confirm
+ * against scale_delta().
+ */
+#define _TS_SHIFT_IDENTITY 1
+#define _TS_MUL_FRAC_IDENTITY 0x80000000UL
+#define _TS_IDENTITY { _TS_SHIFT_IDENTITY, _TS_MUL_FRAC_IDENTITY }
+/*
+ * Return 1 if *ts holds exactly the identity shift and multiplier,
+ * 0 otherwise. Callers in this file use it to skip scale_delta().
+ */
+static inline int time_scale_is_identity(struct time_scale *ts)
+{
+ if ( ts->shift != _TS_SHIFT_IDENTITY )
+ return 0;
+ else if ( ts->mul_frac != _TS_MUL_FRAC_IDENTITY )
+ return 0;
+ return 1;
+}
+
+/* Overwrite *ts with the identity shift and multiplier. */
+static inline void set_time_scale_identity(struct time_scale *ts)
+{
+ ts->shift = _TS_SHIFT_IDENTITY;
+ ts->mul_frac = _TS_MUL_FRAC_IDENTITY;
+}
+
/* Compute the reciprocal of the given time_scale. */
static inline struct time_scale scale_reciprocal(struct time_scale scale)
{
struct time_scale reciprocal;
u32 dividend;
- dividend = 0x80000000u;
+ ASSERT(scale.mul_frac != 0);
+ dividend = _TS_MUL_FRAC_IDENTITY;
reciprocal.shift = 1 - scale.shift;
while ( unlikely(dividend >= scale.mul_frac) )
{
@@ -818,6 +840,8 @@ static void __update_vcpu_system_time(st
struct cpu_time *t;
struct vcpu_time_info *u, _u;
XEN_GUEST_HANDLE(vcpu_time_info_t) user_u;
+ struct domain *d = v->domain;
+ s_time_t tsc_stamp = 0;
if ( v->vcpu_info == NULL )
return;
@@ -825,20 +849,31 @@ static void __update_vcpu_system_time(st
t = &this_cpu(cpu_time);
u = &vcpu_info(v, time);
+ if ( d->arch.vtsc )
+ {
+ tsc_stamp = t->stime_local_stamp - d->arch.vtsc_offset;
+ if ( !time_scale_is_identity(&d->arch.ns_to_vtsc) )
+ tsc_stamp = scale_delta(tsc_stamp, &d->arch.ns_to_vtsc);
+ }
+ else
+ tsc_stamp = t->local_tsc_stamp;
+
+ if ( d->arch.tsc_mode == TSC_MODE_PVRDTSCP &&
+ boot_cpu_has(X86_FEATURE_RDTSCP) )
+ write_rdtscp_aux(d->arch.incarnation);
+
/* Don't bother unless timestamps have changed or we are forced. */
- if ( !force && (u->tsc_timestamp == (v->domain->arch.vtsc
- ? t->stime_local_stamp
- : t->local_tsc_stamp)) )
+ if ( !force && (u->tsc_timestamp == tsc_stamp) )
return;
memset(&_u, 0, sizeof(_u));
- if ( v->domain->arch.vtsc )
- {
- _u.tsc_timestamp = t->stime_local_stamp;
+ if ( d->arch.vtsc )
+ {
+ _u.tsc_timestamp = tsc_stamp;
_u.system_time = t->stime_local_stamp;
- _u.tsc_to_system_mul = 0x80000000u;
- _u.tsc_shift = 1;
+ _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
+ _u.tsc_shift = d->arch.vtsc_to_ns.shift;
}
else
{
@@ -1556,7 +1591,7 @@ static void tsc_check_slave(void *unused
local_irq_enable();
}
-static void tsc_check_reliability(void)
+void tsc_check_reliability(void)
{
unsigned int cpu = smp_processor_id();
static DEFINE_SPINLOCK(lock);
@@ -1583,57 +1618,245 @@ void pv_soft_rdtsc(struct vcpu *v, struc
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs)
{
s_time_t now = get_s_time();
-
- spin_lock(&v->domain->arch.vtsc_lock);
+ struct domain *d = v->domain;
+
+ spin_lock(&d->arch.vtsc_lock);
if ( guest_kernel_mode(v, regs) )
- v->domain->arch.vtsc_kerncount++;
+ d->arch.vtsc_kerncount++;
else
- v->domain->arch.vtsc_usercount++;
-
- if ( (int64_t)(now - v->domain->arch.vtsc_last) > 0 )
- v->domain->arch.vtsc_last = now;
+ d->arch.vtsc_usercount++;
+
+ if ( (int64_t)(now - d->arch.vtsc_last) > 0 )
+ d->arch.vtsc_last = now;
else
- now = ++v->domain->arch.vtsc_last;
-
- spin_unlock(&v->domain->arch.vtsc_lock);
+ now = ++d->arch.vtsc_last;
+
+ spin_unlock(&d->arch.vtsc_lock);
+
+ now = now - d->arch.vtsc_offset;
+ if ( !time_scale_is_identity(&d->arch.ns_to_vtsc) )
+ now = scale_delta(now, &d->arch.ns_to_vtsc);
regs->eax = (uint32_t)now;
regs->edx = (uint32_t)(now >> 32);
}
+/*
+ * Decide whether the host TSC may be used natively by guests.
+ * Returns 1 when any of the following holds, 0 otherwise:
+ *  - the boot CPU has X86_FEATURE_TSC_RELIABLE;
+ *  - only one CPU is online;
+ *  - the boot CPU has X86_FEATURE_CONSTANT_TSC, max_cstate is at most 2,
+ *    and the warp check has recorded a maximum warp of zero.
+ */
+static int host_tsc_is_safe(void)
+{
+ extern unsigned int max_cstate;
+
+ if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
+ return 1;
+ if ( num_online_cpus() == 1 )
+ return 1;
+ if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && max_cstate <= 2 )
+ {
+ /* run the warp check once if it has never been run */
+ if ( !tsc_check_count )
+ tsc_check_reliability();
+ if ( tsc_max_warp == 0 )
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Fill in the time-related hypervisor CPUID leaf for the current domain.
+ * sub_idx selects what is reported:
+ *   0: eax = feature bits (bit 0: TSC is emulated for this domain,
+ *      bit 1: host TSC deemed safe, bit 2: host supports RDTSCP),
+ *      ebx = tsc_mode, ecx = domain tsc_khz, edx = incarnation
+ *   1: eax:ebx = this CPU's local_tsc_stamp (low:high),
+ *      ecx = tsc_scale.mul_frac, edx = incarnation
+ *   2: eax:ebx = this CPU's stime_local_stamp (low:high),
+ *      ecx = tsc_scale.shift, edx = incarnation
+ *   3: eax = physical cpu_khz, ebx = ecx = 0, edx = incarnation
+ * Any other sub_idx leaves the outputs untouched.
+ */
+void cpuid_time_leaf(uint32_t sub_idx, uint32_t *eax, uint32_t *ebx,
+                     uint32_t *ecx, uint32_t *edx)
+{
+    struct domain *d = current->domain;
+    struct cpu_time *t;
+
+    t = &this_cpu(cpu_time);
+    switch ( sub_idx )
+    {
+    case 0: /* features */
+        *eax = ( ( (!!d->arch.vtsc) << 0 ) |
+                 ( (!!host_tsc_is_safe()) << 1 ) |
+                 ( (!!boot_cpu_has(X86_FEATURE_RDTSCP)) << 2 ) |
+               0 );
+        *ebx = d->arch.tsc_mode;
+        *ecx = d->arch.tsc_khz;
+        *edx = d->arch.incarnation;
+        break;
+    case 1: /* pvclock group1 */ /* FIXME are these right? */
+        *eax = (uint32_t)t->local_tsc_stamp;
+        *ebx = (uint32_t)(t->local_tsc_stamp >> 32);
+        *ecx = t->tsc_scale.mul_frac;
+        *edx = d->arch.incarnation;
+        break;
+    case 2: /* pvclock scaling values */ /* FIXME are these right? */
+        *eax = (uint32_t)t->stime_local_stamp;
+        *ebx = (uint32_t)(t->stime_local_stamp >> 32);
+        *ecx = t->tsc_scale.shift;
+        *edx = d->arch.incarnation;
+        /* without this break, case 3 overwrote eax/ebx/ecx set above */
+        break;
+    case 3: /* physical cpu_khz */
+        *eax = cpu_khz;
+        *ebx = *ecx = 0;
+        *edx = d->arch.incarnation;
+        break;
+    }
+}
+
+/*
+ * called to collect tsc-related data only for save file or live
+ * migrate; called after last rdtsc is done on this incarnation
+ *
+ * Outputs, per d->arch.tsc_mode:
+ *   NEVER_EMULATE:  elapsed_nsec = gtsc_khz = 0
+ *   ALWAYS_EMULATE: elapsed_nsec = system time minus vtsc_offset,
+ *                   gtsc_khz = 1000000
+ *   DEFAULT:        if emulating, as above but gtsc_khz = d->arch.tsc_khz;
+ *                   otherwise elapsed_nsec is the current host TSC scaled
+ *                   by vtsc_to_ns and gtsc_khz = cpu_khz
+ *   PVRDTSCP:       elapsed_nsec = system time minus vtsc_offset,
+ *                   gtsc_khz = d->arch.tsc_khz
+ *   anything else:  elapsed_nsec = gtsc_khz = 0
+ * tsc_mode and incarnation are always copied from the domain.
+ */
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
+                  uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
+                  uint32_t *incarnation)
+{
+    *incarnation = d->arch.incarnation;
+    switch ( *tsc_mode = d->arch.tsc_mode )
+    {
+    case TSC_MODE_NEVER_EMULATE:
+        *elapsed_nsec = *gtsc_khz = 0;
+        break;
+    case TSC_MODE_ALWAYS_EMULATE:
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
+        *gtsc_khz = 1000000UL;
+        break;
+    case TSC_MODE_DEFAULT:
+        if ( d->arch.vtsc )
+        {
+            *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
+            *gtsc_khz = d->arch.tsc_khz;
+        } else {
+            uint64_t tsc = 0;
+            rdtscll(tsc);
+            *elapsed_nsec = scale_delta(tsc,&d->arch.vtsc_to_ns);
+            *gtsc_khz = cpu_khz;
+        }
+        break;
+    case TSC_MODE_PVRDTSCP:
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? */
+        *gtsc_khz = d->arch.tsc_khz;
+        break;
+    default:
+        /*
+         * tsc_set_info() records whatever mode it is handed, so an
+         * unknown value can reach here; never leave the outputs
+         * unwritten, since the caller copies them out to the toolstack.
+         */
+        *elapsed_nsec = 0;
+        *gtsc_khz = 0;
+        break;
+    }
+}
+
+/*
+ * This may be called as many as three times for a domain, once when the
+ * hypervisor creates the domain, once when the toolstack creates the
+ * domain and, if restoring/migrating, once when saved/migrated values
+ * are restored.  Care must be taken that, if multiple calls occur,
+ * only the last "sticks" and all are completed before the guest executes
+ * an rdtsc instruction.
+ *
+ * d:            domain to configure; for domain 0 and DOMID_INVALID the
+ *               TSC is left native (vtsc = 0) and nothing else changes
+ * tsc_mode:     TSC_MODE_* value, recorded in d->arch.tsc_mode
+ * elapsed_nsec: subtracted from the current system time to form
+ *               vtsc_offset in the emulating cases
+ * gtsc_khz:     TSC rate supplied by the caller; 0 when none is supplied
+ * incarnation:  caller's incarnation count; stored incremented by one
+ */
+void tsc_set_info(struct domain *d,
+                  uint32_t tsc_mode, uint64_t elapsed_nsec,
+                  uint32_t gtsc_khz, uint32_t incarnation)
+{
+    if ( d->domain_id == 0 || d->domain_id == DOMID_INVALID )
+    {
+        d->arch.vtsc = 0;
+        return;
+    }
+    switch ( d->arch.tsc_mode = tsc_mode )
+    {
+    case TSC_MODE_NEVER_EMULATE:
+        gdprintk(XENLOG_G_INFO, "%s: never emulating TSC\n",__func__);
+        d->arch.vtsc = 0;
+        break;
+    case TSC_MODE_ALWAYS_EMULATE:
+        gdprintk(XENLOG_G_INFO, "%s: always emulating TSC\n",__func__);
+        d->arch.vtsc = 1;
+        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+        set_time_scale_identity(&d->arch.vtsc_to_ns);
+        break;
+    case TSC_MODE_DEFAULT:
+        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+        if ( (host_tsc_is_safe() && incarnation == 0) || !d->domain_id )
+        {
+            gdprintk(XENLOG_G_INFO, "%s: using safe native TSC\n",__func__);
+            /* use native TSC if initial host supports it */
+            d->arch.vtsc = 0;
+            d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz;
+            set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
+            set_time_scale_identity(&d->arch.ns_to_vtsc);
+        } else if ( gtsc_khz != 0 && gtsc_khz != 1000000UL ) {
+            gdprintk(XENLOG_G_INFO, "%s: safe native TSC on initial host,"
+                     "but now using emulation\n",__func__);
+            /* was native on initial host, now emulated at initial tsc hz */
+            d->arch.vtsc = 1;
+            d->arch.tsc_khz = gtsc_khz;
+            set_time_scale(&d->arch.vtsc_to_ns, gtsc_khz * 1000 );
+            d->arch.ns_to_vtsc =
+                scale_reciprocal(d->arch.vtsc_to_ns);
+        } else {
+            gdprintk(XENLOG_G_INFO, "%s: unsafe TSC on initial host,"
+                     "using emulation\n",__func__);
+            d->arch.vtsc = 1;
+            set_time_scale_identity(&d->arch.vtsc_to_ns);
+            set_time_scale_identity(&d->arch.ns_to_vtsc);
+        }
+        break;
+    case TSC_MODE_PVRDTSCP:
+        gdprintk(XENLOG_G_INFO, "%s: using PVRDTSCP\n",__func__);
+        if ( boot_cpu_has(X86_FEATURE_RDTSCP) && gtsc_khz != 0 ) {
+            d->arch.vtsc = 0;
+            set_time_scale(&d->arch.vtsc_to_ns, gtsc_khz * 1000 );
+        } else {
+            d->arch.vtsc = 1;
+            d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+            set_time_scale_identity(&d->arch.vtsc_to_ns);
+        }
+        break;
+    }
+    /* each successful call counts as a new incarnation of the domain */
+    d->arch.incarnation = incarnation + 1;
+    if ( is_hvm_domain(d) )
+        hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+}
+
/* vtsc may incur measurable performance degradation, diagnose with this */
static void dump_softtsc(unsigned char key)
{
struct domain *d;
int domcnt = 0;
+ extern unsigned int max_cstate;
tsc_check_reliability();
if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
printk("TSC marked as reliable, "
"warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
- printk("TSC marked as constant but not reliable, "
- "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
- else
+ {
+ printk("TSC has constant rate, ");
+ if (max_cstate <= 2 && tsc_max_warp == 0)
+ printk("no deep Cstates, passed warp test, deemed reliable, ");
+ else
+ printk("deep Cstates possible, so not reliable, ");
+ printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
+ } else
printk("TSC not marked as either constant or reliable, "
- "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
+ "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
for_each_domain ( d )
{
+ if ( d->domain_id == 0 && d->arch.tsc_mode == TSC_MODE_DEFAULT )
+ continue;
+ printk("dom%u%s: mode=%d",d->domain_id,
+ is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
+ if ( d->arch.vtsc_offset )
+ printk(",ofs=0x%"PRIx64"",d->arch.vtsc_offset);
+ if ( d->arch.tsc_khz )
+ printk(",khz=%"PRIu32"",d->arch.tsc_khz);
+ if ( d->arch.incarnation )
+ printk(",inc=%"PRIu32"",d->arch.incarnation);
if ( !d->arch.vtsc )
+ {
+ printk("\n");
continue;
+ }
if ( is_hvm_domain(d) )
- printk("dom%u (hvm) vtsc count: %"PRIu64" total\n",
- d->domain_id, d->arch.vtsc_kerncount);
+ printk(",vtsc count: %"PRIu64" total\n",
+ d->arch.vtsc_kerncount);
else
- printk("dom%u vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
- d->domain_id, d->arch.vtsc_kerncount,
- d->arch.vtsc_usercount);
+ printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
+ d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
domcnt++;
}
if ( !domcnt )
- printk("All domains have native TSC\n");
+ printk("No domains have emulated TSC\n");
}
static struct keyhandler dump_softtsc_keyhandler = {
diff -r 12644e457c37 -r d53db6af369f xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c Wed Nov 25 14:04:46 2009 +0000
+++ b/xen/arch/x86/traps.c Wed Nov 25 14:05:28 2009 +0000
@@ -679,8 +679,8 @@ int wrmsr_hypervisor_regs(uint32_t idx,
return 1;
}
-int cpuid_hypervisor_leaves(
- uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
struct domain *d = current->domain;
/* Optionally shift out of the way of Viridian architectural leaves. */
@@ -693,7 +693,7 @@ int cpuid_hypervisor_leaves(
switch ( idx )
{
case 0:
- *eax = base + 2; /* Largest leaf */
+ *eax = base + 3; /* Largest leaf */
*ebx = XEN_CPUID_SIGNATURE_EBX;
*ecx = XEN_CPUID_SIGNATURE_ECX;
*edx = XEN_CPUID_SIGNATURE_EDX;
@@ -717,6 +717,11 @@ int cpuid_hypervisor_leaves(
*ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
break;
+ case 3:
+ *eax = *ebx = *ecx = *edx = 0;
+ cpuid_time_leaf( sub_idx, eax, ebx, ecx, edx );
+ break;
+
default:
BUG();
}
@@ -735,7 +740,7 @@ static void pv_cpuid(struct cpu_user_reg
if ( current->domain->domain_id != 0 )
{
- if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
+ if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) )
domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
goto out;
}
@@ -815,7 +820,7 @@ static void pv_cpuid(struct cpu_user_reg
a = b = c = d = 0;
break;
default:
- (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
+ (void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d);
break;
}
diff -r 12644e457c37 -r d53db6af369f xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Wed Nov 25 14:04:46 2009 +0000
+++ b/xen/include/asm-x86/domain.h Wed Nov 25 14:05:28 2009 +0000
@@ -230,6 +230,11 @@ struct domain_mca_msrs
spinlock_t lock;
};
+struct time_scale {
+ int shift;
+ u32 mul_frac;
+};
+
struct arch_domain
{
#ifdef CONFIG_X86_64
@@ -298,10 +303,17 @@ struct arch_domain
/* For Guest vMCA handling */
struct domain_mca_msrs vmca_msrs;
- /* SoftTSC emulation */
- bool_t vtsc;
- s_time_t vtsc_last;
+ /* TSC management (emulation, pv, scaling, stats) */
+ int tsc_mode; /* see include/asm-x86/time.h */
+ bool_t vtsc; /* tsc is emulated (may change after migrate) */
+ s_time_t vtsc_last; /* previous TSC value (guarantee monotonicity) */
spinlock_t vtsc_lock;
+ uint64_t vtsc_offset; /* adjustment for save/restore/migrate */
+ uint32_t tsc_khz; /* cached khz for certain emulated cases */
+ struct time_scale vtsc_to_ns; /* scaling for certain emulated cases */
+ struct time_scale ns_to_vtsc; /* scaling for certain emulated cases */
+ uint32_t incarnation; /* incremented every restore or live migrate
+ (possibly other cases in the future) */
uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */
uint64_t vtsc_usercount; /* not used for hvm */
} __cacheline_aligned;
diff -r 12644e457c37 -r d53db6af369f xen/include/asm-x86/msr.h
--- a/xen/include/asm-x86/msr.h Wed Nov 25 14:04:46 2009 +0000
+++ b/xen/include/asm-x86/msr.h Wed Nov 25 14:05:28 2009 +0000
@@ -84,6 +84,8 @@ static inline void wrmsrl(unsigned int m
#define write_tsc(val) wrmsrl(MSR_IA32_TSC, val)
+#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)
+
#define rdpmc(counter,low,high) \
__asm__ __volatile__("rdpmc" \
: "=a" (low), "=d" (high) \
diff -r 12644e457c37 -r d53db6af369f xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h Wed Nov 25 14:04:46 2009 +0000
+++ b/xen/include/asm-x86/processor.h Wed Nov 25 14:05:28 2009 +0000
@@ -550,8 +550,8 @@ void cpu_mcheck_distribute_cmci(void);
void cpu_mcheck_distribute_cmci(void);
void cpu_mcheck_disable(void);
-int cpuid_hypervisor_leaves(
- uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val);
int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val);
diff -r 12644e457c37 -r d53db6af369f xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h Wed Nov 25 14:04:46 2009 +0000
+++ b/xen/include/asm-x86/time.h Wed Nov 25 14:05:28 2009 +0000
@@ -3,6 +3,24 @@
#define __X86_TIME_H__
#include <asm/msr.h>
+
+/*
+ * PV TSC emulation modes:
+ * 0 = guest rdtsc/p executed natively when monotonicity can be guaranteed
+ * and emulated otherwise (with frequency scaled if necessary)
+ * 1 = guest rdtsc/p always emulated at 1GHz (kernel and user)
+ * 2 = guest rdtsc always executed natively (no monotonicity/frequency
+ * guarantees); guest rdtscp emulated at native frequency if
+ * unsupported by h/w, else executed natively
+ * 3 = same as 2, except xen manages TSC_AUX register so guest can
+ * determine when a restore/migration has occurred and assumes
+ * guest obtains/uses pvclock-like mechanism to adjust for
+ * monotonicity and frequency changes
+ */
+#define TSC_MODE_DEFAULT 0
+#define TSC_MODE_ALWAYS_EMULATE 1
+#define TSC_MODE_NEVER_EMULATE 2
+#define TSC_MODE_PVRDTSCP 3
void calibrate_tsc_bp(void);
void calibrate_tsc_ap(void);
@@ -43,6 +61,16 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns)
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
+void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
+ uint32_t gtsc_khz, uint32_t incarnation);
+
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec,
+ uint32_t *gtsc_khz, uint32_t *incarnation);
+
+
void force_update_vcpu_system_time(struct vcpu *v);
+void cpuid_time_leaf(uint32_t sub_idx, unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
#endif /* __X86_TIME_H__ */
diff -r 12644e457c37 -r d53db6af369f xen/include/public/domctl.h
--- a/xen/include/public/domctl.h Wed Nov 25 14:04:46 2009 +0000
+++ b/xen/include/public/domctl.h Wed Nov 25 14:05:28 2009 +0000
@@ -401,11 +401,6 @@ typedef struct xen_domctl_settimeoffset
typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
-#define XEN_DOMCTL_set_tsc_native 57
-typedef struct xen_domctl_set_tsc_native {
- uint32_t is_native; /* IN: 0: TSC is emulated; 1: TSC is host TSC */
-} xen_domctl_set_tsc_native_t;
-
#define XEN_DOMCTL_gethvmcontext 33
#define XEN_DOMCTL_sethvmcontext 34
typedef struct xen_domctl_hvmcontext {
@@ -655,6 +650,22 @@ typedef struct xen_domctl_disable_migrat
uint32_t disable; /* IN: 1: disable migration and restore */
} xen_domctl_disable_migrate_t;
+
+/*
+ * Get/set a domain's TSC configuration.
+ * gettscinfo writes a xen_guest_tsc_info through out_info;
+ * settscinfo reads its arguments from the embedded info member.
+ */
+#define XEN_DOMCTL_gettscinfo 59
+#define XEN_DOMCTL_settscinfo 60
+struct xen_guest_tsc_info {
+ uint32_t tsc_mode; /* TSC mode: 0=default, 1=always emulate, 2=never emulate, 3=pvrdtscp */
+ uint32_t gtsc_khz; /* TSC rate in kHz associated with the guest */
+ uint32_t incarnation; /* hypervisor stores the value passed in, plus one */
+ uint32_t pad; /* NOTE(review): presumably keeps elapsed_nsec 8-byte aligned -- confirm */
+ uint64_t elapsed_nsec; /* nanoseconds; hypervisor derives vtsc_offset from it */
+};
+typedef struct xen_guest_tsc_info xen_guest_tsc_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t);
+typedef struct xen_domctl_tsc_info {
+ XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */
+ xen_guest_tsc_info_t info; /* IN */
+} xen_domctl_tsc_info_t;
#define XEN_DOMCTL_gdbsx_guestmemio 1000 /* guest mem io */
struct xen_domctl_gdbsx_memio {
@@ -705,8 +716,8 @@ struct xen_domctl {
struct xen_domctl_hypercall_init hypercall_init;
struct xen_domctl_arch_setup arch_setup;
struct xen_domctl_settimeoffset settimeoffset;
- struct xen_domctl_set_tsc_native set_tsc_native;
struct xen_domctl_disable_migrate disable_migrate;
+ struct xen_domctl_tsc_info tsc_info;
struct xen_domctl_real_mode_area real_mode_area;
struct xen_domctl_hvmcontext hvmcontext;
struct xen_domctl_hvmcontext_partial hvmcontext_partial;
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|