WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [RFC] draft tsc_mode patch (to replace tsc_native)

To: "Xen-Devel (E-mail)" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [RFC] draft tsc_mode patch (to replace tsc_native)
From: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
Date: Thu, 12 Nov 2009 15:55:15 -0800 (PST)
Cc: Keir Fraser <keir.fraser@xxxxxxxxxxxxx>
Delivery-date: Thu, 12 Nov 2009 15:59:13 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
This patch isn't done yet, but it contains the core code for
implementing the mechanism I've been proposing for
handling "tsc_mode" (to replace tsc_native/vtsc),
so I thought I'd ask for some feedback, hopefully before
reviewers leave for Xen Summit Asia.

The tsc_mode is set in the VM config file to one of the following values:

#define TSC_MODE_DEFAULT          0
#define TSC_MODE_ALWAYS_EMULATE   1
#define TSC_MODE_NEVER_EMULATE    2
#define TSC_MODE_PVRDTSCP         3

0 = guest rdtsc/p executed natively when monotonicity can be guaranteed
     and emulated otherwise (with frequency scaled if necessary)
1 = guest rdtsc/p always emulated at 1GHz (kernel and user)
2 = guest rdtsc always executed natively (no monotonicity/frequency
     guarantees); guest rdtscp emulated at native frequency if
     unsupported by h/w, else executed natively
3 = same as 2, except xen manages TSC_AUX register so guest can
     determine when a restore/migration has occurred and assumes
     guest obtains/uses pvclock-like mechanism to adjust for
     monotonicity and frequency changes

Tsc_mode must persist across save/restore/migration.
In addition, an offset, a tsc_khz, and an "incarnation" counter
are determined at domain creation and must then also persist across
save/restore/migration (though some are ignored in some tsc modes).

A key improvement over the previous tsc_native implementation
is that if TSC is "safe", the default tsc_mode does not
use emulation until after the first save/restore/migrate
(mimicking the previous implementation by Intel for HVM).
Since a surprising number of machines are now "TSC safe"
and since most domains never get saved/migrated, the
vast majority of VMs will never suffer the performance
penalty of emulated TSC even though TSC correctness for
applications is still provided.

Note that I haven't removed the tsc_native code yet,
some functionality is still in progress,
and there's still a fair amount of debug code to be
removed.

Thanks for any feedback!
Dan

diff -r 494ad84ad38c tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/libxc/xc_domain.c   Thu Nov 12 16:48:53 2009 -0700
@@ -483,6 +483,52 @@ int xc_domain_disable_migrate(int xc_han
     domctl.u.disable_migrate.disable = 1;
     return do_domctl(xc_handle, &domctl);
 }
+
+/* Build an XEN_DOMCTL_settscinfo request and return do_domctl()'s result. */
+int xc_domain_set_tsc_info(int xc_handle, uint32_t domid,
+                           uint32_t tsc_mode, uint64_t elapsed_nsec,
+                           uint32_t gtsc_khz, uint32_t incarnation)
+{
+    DECLARE_DOMCTL;
+    xen_guest_tsc_info_t *in = &domctl.u.tsc_info.info;
+    /* Only the IN record of the payload is filled; out_info is not touched. */
+    in->incarnation = incarnation;
+    in->gtsc_khz = gtsc_khz;
+    in->elapsed_nsec = elapsed_nsec;
+    in->tsc_mode = tsc_mode;
+    domctl.domain = (domid_t)domid;
+    domctl.cmd = XEN_DOMCTL_settscinfo;
+    return do_domctl(xc_handle, &domctl);
+}
+
+/* Fetch a domain's TSC info via XEN_DOMCTL_gettscinfo.  Outputs are written
+ * only when do_domctl() returns 0; a failing rc is returned unchanged. */
+int xc_domain_get_tsc_info(int xc_handle, uint32_t domid,
+                           uint32_t *tsc_mode, uint64_t *elapsed_nsec,
+                           uint32_t *gtsc_khz, uint32_t *incarnation)
+{
+    DECLARE_DOMCTL;
+    xen_guest_tsc_info_t reply = { 0 };
+    int rc;
+
+    domctl.domain = (domid_t)domid;
+    domctl.cmd = XEN_DOMCTL_gettscinfo;
+    set_xen_guest_handle(domctl.u.tsc_info.out_info, &reply);
+    rc = lock_pages(&reply, sizeof(reply));
+    if ( rc != 0 )
+        return rc;
+
+    if ( (rc = do_domctl(xc_handle, &domctl)) == 0 )
+    {
+        *incarnation = reply.incarnation;
+        *gtsc_khz = reply.gtsc_khz;
+        *elapsed_nsec = reply.elapsed_nsec;
+        *tsc_mode = reply.tsc_mode;
+    }
+    unlock_pages(&reply, sizeof(reply));
+    return rc;
+}
+
 
 int xc_domain_memory_increase_reservation(int xc_handle,
                                           uint32_t domid,
diff -r 494ad84ad38c tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/libxc/xc_domain_restore.c   Thu Nov 12 16:48:53 2009 -0700
@@ -1083,6 +1083,23 @@ static int pagebuf_get_one(pagebuf_t* bu
             ERROR("error reading/restoring tmem extra");
             return -1;
         }
+        return pagebuf_get_one(buf, fd, xch, dom);
+    } else if ( count == -7 ) {
+        uint32_t tsc_mode, khz, incarn;
+        uint64_t nsec;
+        if ( read_exact(fd, &tsc_mode, sizeof(uint32_t)) ||
+             read_exact(fd, &nsec, sizeof(uint64_t)) ||
+             read_exact(fd, &khz, sizeof(uint32_t)) ||
+             read_exact(fd, &incarn, sizeof(uint32_t)) ||
+             xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) {
+            ERROR("error reading/restoring tmem extra");
+            return -1;
+        }
+        /* no PRIxxx formatting allowed here???
+        DPRINTF("tsc_info read: mode=%"PRIu32",ns=0x%"PRIx64","
+                "khz=%"PRIu32",incarn=%"PRIu32"\n",
+                tsc_mode, nsec, khz, incarn);
+        */
         return pagebuf_get_one(buf, fd, xch, dom);
     } else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
         ERROR("Max batch size exceeded (%d). Giving up.", count);
diff -r 494ad84ad38c tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/libxc/xc_domain_save.c      Thu Nov 12 16:48:53 2009 -0700
@@ -1099,6 +1099,28 @@ int xc_domain_save(int xc_handle, int io
         ERROR("Error when writing to state file (tmem)");
         goto out;
     }
+
+                {
+                    int marker = -7;
+                    uint32_t tsc_mode, khz, incarn;
+                    uint64_t nsec;
+                    if ( xc_domain_get_tsc_info(xc_handle, dom, &tsc_mode,
+                                                &nsec, &khz, &incarn) < 0  ||
+                         write_exact(io_fd, &marker, sizeof(marker)) ||
+                         write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) ||
+                         write_exact(io_fd, &nsec, sizeof(nsec)) ||
+                         write_exact(io_fd, &khz, sizeof(khz)) ||
+                         write_exact(io_fd, &incarn, sizeof(incarn)) )
+                    {
+                        ERROR("Error when writing to state file (tsc)");
+                        goto out;
+                    }
+                    /* no PRIxxx formatting?
+                    DPRINTK("tsc_info written: mode=%"PRIu32",ns=0x%"PRIx64","
+                            "khz=%"PRIu32",incarn=%"PRIu32"\n",
+                            tsc_mode, nsec, khz, incarn);
+                    */
+                 }
 
   copypages:
 #define write_exact(fd, buf, len) write_buffer(last_iter, &ob, (fd), (buf), 
(len))
@@ -1707,6 +1729,7 @@ int xc_domain_save(int xc_handle, int io
             PERROR("Error when writing to state file (2)");
             goto out;
         }
+
     }
 
     /*
diff -r 494ad84ad38c tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/libxc/xenctrl.h     Thu Nov 12 16:48:53 2009 -0700
@@ -627,6 +627,20 @@ int xc_domain_set_time_offset(int xc_han
 int xc_domain_set_time_offset(int xc_handle,
                               uint32_t domid,
                               int32_t time_offset_seconds);
+
+int xc_domain_set_tsc_info(int xc_handle,
+                           uint32_t domid,
+                           uint32_t tsc_mode,
+                           uint64_t elapsed_nsec,
+                           uint32_t gtsc_khz,
+                           uint32_t incarnation);
+
+int xc_domain_get_tsc_info(int xc_handle,
+                           uint32_t domid,
+                           uint32_t *tsc_mode,
+                           uint64_t *elapsed_nsec,
+                           uint32_t *gtsc_khz,
+                           uint32_t *incarnation);
 
 int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native);
 
diff -r 494ad84ad38c tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/python/xen/lowlevel/xc/xc.c Thu Nov 12 16:48:53 2009 -0700
@@ -1479,6 +1479,20 @@ static PyObject *pyxc_domain_set_tsc_nat
         return NULL;
 
     if (xc_domain_set_tsc_native(self->xc_handle, dom, is_native) != 0)
+        return pyxc_error_to_exception();
+
+    Py_INCREF(zero);
+    return zero;
+}
+
+static PyObject *pyxc_domain_set_tsc_mode(XcObject *self, PyObject *args)
+{
+    uint32_t dom, tsc_mode;
+
+    if (!PyArg_ParseTuple(args, "ii", &dom, &tsc_mode))
+        return NULL;
+
+    if (xc_domain_set_tsc_info(self->xc_handle, dom, tsc_mode, 0, 0, 0) != 0)
         return pyxc_error_to_exception();
 
     Py_INCREF(zero);
@@ -2029,6 +2043,15 @@ static PyMethodDef pyxc_methods[] = {
       " is_native  [int]: 1=native, 0=emulate.\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
+    { "domain_set_tsc_mode",
+      (PyCFunction)pyxc_domain_set_tsc_mode,
+      METH_VARARGS, "\n"
+      "Set a domain's TSC mode\n"
+      " dom        [int]: Domain whose TSC mode is being set.\n"
+      " tsc_mode   [int]: 0=default (monotonic, but native where possible)\n"
+      "                   1=always emulate 2=never emulate 3=pvrdtscp\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
     { "domain_disable_migrate",
       (PyCFunction)pyxc_domain_disable_migrate,
       METH_VARARGS, "\n"
diff -r 494ad84ad38c tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/python/xen/xend/XendConfig.py       Thu Nov 12 16:48:53 2009 -0700
@@ -164,6 +164,7 @@ XENAPI_PLATFORM_CFG_TYPES = {
     'vnclisten': str,
     'timer_mode': int,
     'tsc_native': int,
+    'tsc_mode': int,
     'vpt_align': int,
     'viridian': int,
     'vncpasswd': str,
@@ -479,6 +480,9 @@ class XendConfig(dict):
 
         if 'tsc_native' not in self['platform']:
             self['platform']['tsc_native'] = 0
+
+        if 'tsc_mode' not in self['platform']:
+            self['platform']['tsc_mode'] = 0
 
         if 'nomigrate' not in self['platform']:
             self['platform']['nomigrate'] = 0
diff -r 494ad84ad38c tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/python/xen/xend/XendDomainInfo.py   Thu Nov 12 16:48:53 2009 -0700
@@ -2467,6 +2467,11 @@ class XendDomainInfo:
         if arch.type == "x86" and tsc_native is not None:
             xc.domain_set_tsc_native(self.domid, int(tsc_native))
 
+        # Set TSC mode of domain
+        tsc_mode = self.info["platform"].get("tsc_mode")
+        if arch.type == "x86" and tsc_mode is not None:
+            xc.domain_set_tsc_native(self.domid, int(tsc_mode))
+
         # Set timer configuration of domain
         timer_mode = self.info["platform"].get("timer_mode")
         if hvm and timer_mode is not None:
diff -r 494ad84ad38c tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/python/xen/xm/create.py     Thu Nov 12 16:48:53 2009 -0700
@@ -224,6 +224,10 @@ gopts.var('tsc_native', val='TSC_NATIVE'
 gopts.var('tsc_native', val='TSC_NATIVE',
           fn=set_int, default=0,
           use="""TSC mode (0=emulate TSC, 1=native TSC).""")
+
+gopts.var('tsc_mode', val='TSC_MODE',
+          fn=set_int, default=0,
+          use="""TSC mode (0=default, 1=always emulate, 2=never emulate, 
3=pvrdtscp).""")
 
 gopts.var('nomigrate', val='NOMIGRATE',
           fn=set_int, default=0,
@@ -741,6 +745,10 @@ def configure_image(vals):
     if vals.tsc_native is not None:
         config_image.append(['tsc_native', vals.tsc_native])
 
+    # DJM DJM don't think I need this???
+    #if vals.tsc_mode is not None:
+    #    config_image.append(['tsc_mode', vals.tsc_mode])
+
     if vals.nomigrate is not None:
         config_image.append(['nomigrate', vals.nomigrate])
 
@@ -1027,7 +1035,7 @@ def make_config(vals):
                 config.append([n, v])
 
     map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
-                   'restart', 'on_poweroff', 'tsc_native', 'nomigrate',
+                   'restart', 'on_poweroff', 'tsc_native', 'tsc_mode', 
'nomigrate',
                    'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail', 'features',
                    'on_xend_start', 'on_xend_stop', 'target', 'cpuid',
                    'cpuid_check', 'machine_address_size', 
'suppress_spurious_page_faults'])
diff -r 494ad84ad38c tools/python/xen/xm/xenapi_create.py
--- a/tools/python/xen/xm/xenapi_create.py      Mon Nov 09 22:41:23 2009 +0000
+++ b/tools/python/xen/xm/xenapi_create.py      Thu Nov 12 16:48:53 2009 -0700
@@ -1078,6 +1078,7 @@ class sxp2xml:
             'pci_power_mgmt',
             'xen_platform_pci',
             'tsc_native'
+            'tsc_mode'
             'description',
             'nomigrate'
         ]
diff -r 494ad84ad38c xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/arch/x86/domain.c     Thu Nov 12 16:48:53 2009 -0700
@@ -520,6 +520,9 @@ int arch_domain_create(struct domain *d,
         d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
     }
 
+    /* initialize default tsc behavior in case tools don't */
+if (d->domain_id) //DJM
+    tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
     spin_lock_init(&d->arch.vtsc_lock);
 
     return 0;
diff -r 494ad84ad38c xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c     Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/arch/x86/domctl.c     Thu Nov 12 16:48:53 2009 -0700
@@ -1111,9 +1111,65 @@ long arch_do_domctl(
             break;
 
         domain_pause(d);
-        d->arch.vtsc = !domctl->u.set_tsc_native.is_native;
-        if ( is_hvm_domain(d) )
-            hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+printk("DJM DJM ignoring set_tsc_native for now\n");
+        //d->arch.vtsc = !domctl->u.set_tsc_native.is_native;
+        //if ( is_hvm_domain(d) )
+            //hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+        domain_unpause(d);
+
+        rcu_unlock_domain(d);
+        ret = 0;
+    }
+    break;
+
+    case XEN_DOMCTL_gettscinfo:
+    {
+        struct domain *d;
+        xen_guest_tsc_info_t info;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+            break;
+
+        domain_pause(d);
+        tsc_get_info(d, &info.tsc_mode,
+                        &info.elapsed_nsec,
+                        &info.gtsc_khz,
+                        &info.incarnation);
+        /* tsc_get_info(d, &domctl->u.tsc_info.tsc_mode,
+                        &domctl->u.tsc_info.elapsed_nsec,
+                        &domctl->u.tsc_info.gtsc_khz,
+                        &domctl->u.tsc_info.incarnation);*/
+        //if ( raw_copy_to_guest(domctl->u.tsc_info.out_info.p, &info, 
sizeof(info)) )
+        if ( copy_to_guest(domctl->u.tsc_info.out_info, &info, 1) )
+{
+printk("XEN_DOMCTL_gettscinfo: copy failed, out_info=%p\n",
+domctl->u.tsc_info.out_info.p);
+            ret = -EFAULT;
+}
+        else
+            ret = 0;
+        domain_unpause(d);
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_settscinfo:
+    {
+        struct domain *d;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+            break;
+
+        domain_pause(d);
+        tsc_set_info(d, domctl->u.tsc_info.info.tsc_mode,
+                     domctl->u.tsc_info.info.elapsed_nsec,
+                     domctl->u.tsc_info.info.gtsc_khz,
+                     domctl->u.tsc_info.info.incarnation);
         domain_unpause(d);
 
         rcu_unlock_domain(d);
diff -r 494ad84ad38c xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/arch/x86/time.c       Thu Nov 12 16:48:53 2009 -0700
@@ -818,6 +818,7 @@ static void __update_vcpu_system_time(st
     struct cpu_time       *t;
     struct vcpu_time_info *u, _u;
     XEN_GUEST_HANDLE(vcpu_time_info_t) user_u;
+    s_time_t vtsc_stamp = 0;
 
     if ( v->vcpu_info == NULL )
         return;
@@ -825,9 +826,15 @@ static void __update_vcpu_system_time(st
     t = &this_cpu(cpu_time);
     u = &vcpu_info(v, time);
 
+    if ( v->domain->arch.vtsc )
+        /* FIXME: need scaling here too? */
+        vtsc_stamp = t->stime_local_stamp - v->domain->arch.vtsc_offset;
+    else if ( v->domain->arch.pvrdtscp )
+        /* FIXME: write tsc_aux here? */;
+
     /* Don't bother unless timestamps have changed or we are forced. */
     if ( !force && (u->tsc_timestamp == (v->domain->arch.vtsc
-                                         ? t->stime_local_stamp
+                                         ? vtsc_stamp
                                          : t->local_tsc_stamp)) )
         return;
 
@@ -835,8 +842,8 @@ static void __update_vcpu_system_time(st
 
     if ( v->domain->arch.vtsc )
     {
-        _u.tsc_timestamp     = t->stime_local_stamp;
-        _u.system_time       = t->stime_local_stamp;
+        _u.tsc_timestamp     = vtsc_stamp;
+        _u.system_time       = vtsc_stamp;
         _u.tsc_to_system_mul = 0x80000000u;
         _u.tsc_shift         = 1;
     }
@@ -1598,8 +1605,126 @@ void pv_soft_rdtsc(struct vcpu *v, struc
 
     spin_unlock(&v->domain->arch.vtsc_lock);
 
+    now -= v->domain->arch.vtsc_offset;
+    if ( v->domain->arch.vtsc_shift != 1 ||
+         v->domain->arch.vtsc_mul_frac != 0x80000000u )
+    {
+/* FIXME
+        struct time_scale scale;
+        scale.shift = v->domain->arch.vtsc_shift;
+        scale.mul_frac = v->domain->arch.vtsc_mul_frac;
+        now = scale_delta(now, &scale);
+*/
+    }
+
     regs->eax = (uint32_t)now;
     regs->edx = (uint32_t)(now >> 32);
+}
+
+/*
+ * Host TSC counts as "safe" (1) if it is flagged reliable, if only one CPU
+ * is online, or if it is constant-rate and max_cstate <= 2; otherwise 0.
+ */
+int host_tsc_is_safe(void)
+{
+    extern unsigned int max_cstate;
+
+    return boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ||
+           num_online_cpus() == 1 ||
+           (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && max_cstate <= 2);
+}
+
+/* Collect tsc-related data, called only for save file or live migrate.
+ * Outputs are zeroed first, so TSC_MODE_NEVER_EMULATE, unknown modes, and
+ * fields a mode does not report (e.g. incarnation) read back as 0. */
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec,
+                  uint32_t *gtsc_khz, uint32_t *incarnation)
+{
+    *elapsed_nsec = *gtsc_khz = *incarnation = 0;
+    switch ( *tsc_mode = d->arch.tsc_mode )
+    {
+    case TSC_MODE_ALWAYS_EMULATE:
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? */
+        *gtsc_khz = 1000000UL;
+        break;
+    case TSC_MODE_DEFAULT:
+        *incarnation = d->arch.incarnation;
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? */
+        *gtsc_khz =  ( d->arch.vtsc == 0 ) ?  cpu_khz : 1000000UL;
+        break;
+    case TSC_MODE_PVRDTSCP:
+        *incarnation = d->arch.incarnation;
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? */
+        *gtsc_khz =  d->arch.tsc_khz;
+        break;
+    }
+printk("DJM tsc_get_info got: dom%u,mode=%d,ns=0x%lx,khz=%d,inc=%d\n",
+d->domain_id,(int)*tsc_mode,(long)*elapsed_nsec,(int)*gtsc_khz,(int)*incarnation);
+}
+
+/* called to set tsc-related data only on restore or target of live migrate;
+ * arch_domain_create() also calls it with all-zero info as the default */
+void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
+                  uint32_t gtsc_khz, uint32_t incarnation)
+{
+    struct time_scale scale;
+    uint64_t gtsc_hz = (uint64_t)gtsc_khz * 1000; /* avoid 32-bit wrap */
+printk("DJM tsc_set_info before: dom%u,mode=%d,ns=0x%lx,khz=%d,inc=%d\n",
+d->domain_id,(int)tsc_mode,(long)elapsed_nsec,(int)gtsc_khz,(int)incarnation);
+    switch ( d->arch.tsc_mode = tsc_mode )
+    {
+    case TSC_MODE_NEVER_EMULATE:
+        d->arch.vtsc = 0;
+        break;
+    case TSC_MODE_ALWAYS_EMULATE:
+        d->arch.vtsc = 1;
+        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+        d->arch.vtsc_shift = 1;
+        d->arch.vtsc_mul_frac = 0x80000000U;
+        break;
+    case TSC_MODE_DEFAULT:
+        if ( host_tsc_is_safe() && incarnation == 0 ) {
+            d->arch.vtsc = 0;
+            d->arch.incarnation = 1;
+            d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz;
+            /* d->arch.vtsc_shift/mul_frac/offset will not be used */
+        } else if ( gtsc_khz != 0  && gtsc_khz != 1000000 ) {
+printk("DJM tsc_set_info A: khz=%d\n",gtsc_khz);
+            d->arch.vtsc = 1;
+            set_time_scale(&scale, gtsc_hz);
+            d->arch.vtsc_shift = scale.shift;
+            d->arch.vtsc_mul_frac = scale.mul_frac;
+            d->arch.vtsc_offset = get_s_time() - elapsed_nsec; /* FIXME? */
+        } else {
+            d->arch.vtsc = 1;
+            d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+            d->arch.vtsc_shift = 1;
+            d->arch.vtsc_mul_frac = 0x80000000U;
+            d->arch.incarnation = incarnation + 1;
+printk("DJM tsc_set_info B: offset=%ld\n",(long)d->arch.vtsc_offset);
+        }
+        break;
+    case TSC_MODE_PVRDTSCP:
+        /* if (hardware supports rdtscp instruction) FIXME */
+        d->arch.pvrdtscp = 1;
+        d->arch.vtsc = 0;
+        if ( gtsc_khz != 0 ) {
+            set_time_scale(&scale, gtsc_hz);
+            d->arch.vtsc_shift = scale.shift;
+            d->arch.vtsc_mul_frac = scale.mul_frac;
+        } else {
+            d->arch.vtsc = 1;
+            d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+            d->arch.vtsc_shift = 1;
+            d->arch.vtsc_mul_frac = 0x80000000U;
+            d->arch.incarnation = incarnation + 1;
+        }
+        break;
+    }
+    if ( is_hvm_domain(d) )
+        hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+printk("DJM tsc_set_info after: dom%u,vtsc=%d,ofs=0x%lx,sh=%d,mulfrac=0x%08x,inc=%d\n",
+d->domain_id,(int)d->arch.vtsc,(long)d->arch.vtsc_offset,(int)d->arch.vtsc_shift,(int)d->arch.vtsc_mul_frac,(int)d->arch.incarnation);
 }
 
 /* vtsc may incur measurable performance degradation, diagnose with this */
@@ -1607,33 +1732,51 @@ static void dump_softtsc(unsigned char k
 {
     struct domain *d;
     int domcnt = 0;
+    extern unsigned int max_cstate;
 
     tsc_check_reliability();
     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
         printk("TSC marked as reliable, "
                "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
     else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
-        printk("TSC marked as constant but not reliable, "
-               "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
-    else
+    {
+        printk("TSC has constant rate, ");
+        if (max_cstate <= 2)
+            printk("no deep Cstates possible, so deemed reliable, ");
+        else
+            printk("deep Cstates possible, so not reliable, ");
+        printk("warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
+    } else
         printk("TSC not marked as either constant or reliable, "
                "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
     for_each_domain ( d )
     {
+        if ( d->domain_id == 0 && d->arch.tsc_mode == TSC_MODE_DEFAULT )
+            continue;
+        printk("dom%u%s: mode=%d",d->domain_id,
+                is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
+        if ( d->arch.vtsc_offset )
+            printk(",ofs=0x%"PRIx64"",d->arch.vtsc_offset);
+        if ( d->arch.tsc_khz )
+            printk(",khz=%"PRIu32"",d->arch.tsc_khz);
+        if ( d->arch.incarnation )
+            printk(",inc=%"PRIu32"",d->arch.incarnation);
         if ( !d->arch.vtsc )
+        {
+            printk("\n");
             continue;
+        }
         if ( is_hvm_domain(d) )
-            printk("dom%u (hvm) vtsc count: %"PRIu64" total\n",
-                   d->domain_id, d->arch.vtsc_kerncount);
+            printk(",vtsc count: %"PRIu64" total\n",
+                   d->arch.vtsc_kerncount);
         else
-            printk("dom%u vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
-                   d->domain_id, d->arch.vtsc_kerncount,
-                   d->arch.vtsc_usercount);
+            printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
+                   d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
         domcnt++;
     }
 
     if ( !domcnt )
-            printk("All domains have native TSC\n");
+            printk("No domains have emulated TSC\n");
 }
 
 static struct keyhandler dump_softtsc_keyhandler = {
diff -r 494ad84ad38c xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/include/asm-x86/domain.h      Thu Nov 12 16:48:53 2009 -0700
@@ -299,9 +299,17 @@ struct arch_domain
     struct domain_mca_msrs vmca_msrs;
 
     /* SoftTSC emulation */
-    bool_t vtsc;
-    s_time_t vtsc_last;
+    int tsc_mode;            /* see include/asm-x86/time.h */
+    bool_t vtsc;             /* 1 == enable tsc emulation */
+    bool_t pvrdtscp;         /* set TSC_AUX to incarnation on all vcpus */
+    s_time_t vtsc_last;      /* previous value (to guarantee monotonicity) */
     spinlock_t vtsc_lock;
+    uint64_t vtsc_offset;    /* adjustment for save/restore/migrate */
+    uint32_t tsc_khz;        /* cached khz for certain emulated cases */
+    uint32_t vtsc_shift;     /* cached scaling for certain emulated cases */
+    uint32_t vtsc_mul_frac;  /* cached scaling for certain emulated cases */
+    uint32_t incarnation;    /* incremented every restore or live migrate
+                                (possibly other cases in the future) */
     uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */
     uint64_t vtsc_usercount; /* not used for hvm */
 } __cacheline_aligned;
diff -r 494ad84ad38c xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h        Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/include/asm-x86/time.h        Thu Nov 12 16:48:53 2009 -0700
@@ -3,6 +3,24 @@
 #define __X86_TIME_H__
 
 #include <asm/msr.h>
+
+/*
+ *  PV TSC emulation modes:
+ *    0 = guest rdtsc/p executed natively when monotonicity can be guaranteed
+ *         and emulated otherwise (with frequency scaled if necessary)
+ *    1 = guest rdtsc/p always emulated at 1GHz (kernel and user)
+ *    2 = guest rdtsc always executed natively (no monotonicity/frequency
+ *         guarantees); guest rdtscp emulated at native frequency if
+ *         unsupported by h/w, else executed natively
+ *    3 = same as 2, except xen manages TSC_AUX register so guest can
+ *         determine when a restore/migration has occurred and assumes
+ *         guest obtains/uses pvclock-like mechanism to adjust for
+ *         monotonicity and frequency changes
+ */
+#define TSC_MODE_DEFAULT          0
+#define TSC_MODE_ALWAYS_EMULATE   1
+#define TSC_MODE_NEVER_EMULATE    2
+#define TSC_MODE_PVRDTSCP         3
 
 void calibrate_tsc_bp(void);
 void calibrate_tsc_ap(void);
@@ -43,6 +61,13 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns)
 
 void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
 
+void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
+                  uint32_t gtsc_khz, uint32_t incarnation);
+   
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec,
+                  uint32_t *gtsc_khz, uint32_t *incarnation);
+   
+
 void force_update_vcpu_system_time(struct vcpu *v);
 
 #endif /* __X86_TIME_H__ */
diff -r 494ad84ad38c xen/include/public/domctl.h
--- a/xen/include/public/domctl.h       Mon Nov 09 22:41:23 2009 +0000
+++ b/xen/include/public/domctl.h       Thu Nov 12 16:48:53 2009 -0700
@@ -656,6 +656,22 @@ typedef struct xen_domctl_disable_migrat
 } xen_domctl_disable_migrate_t;
 
 
+#define XEN_DOMCTL_gettscinfo    59 /* read a domain's TSC info */
+#define XEN_DOMCTL_settscinfo    60 /* set a domain's TSC info */
+struct xen_guest_tsc_info { /* TSC state carried across save/restore/migrate */
+    uint32_t tsc_mode; /* one of the TSC_MODE_* values */
+    uint32_t gtsc_khz; /* guest TSC rate in kHz (1000000 = emulated 1GHz) */
+    uint32_t incarnation; /* restore/migrate counter */
+    uint32_t pad; /* keeps elapsed_nsec 8-byte aligned */
+    uint64_t elapsed_nsec; /* get_s_time() minus vtsc_offset when read */
+};
+typedef struct xen_guest_tsc_info xen_guest_tsc_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t);
+typedef struct xen_domctl_tsc_info {
+    XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT: gettscinfo copies the reply here */
+    xen_guest_tsc_info_t info; /* IN: values passed to settscinfo */
+} xen_domctl_tsc_info_t;
+
 #define XEN_DOMCTL_gdbsx_guestmemio     1000 /* guest mem io */
 struct xen_domctl_gdbsx_memio {
     uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */
@@ -707,6 +723,7 @@ struct xen_domctl {
         struct xen_domctl_settimeoffset     settimeoffset;
         struct xen_domctl_set_tsc_native    set_tsc_native;
         struct xen_domctl_disable_migrate   disable_migrate;
+        struct xen_domctl_tsc_info          tsc_info;
         struct xen_domctl_real_mode_area    real_mode_area;
         struct xen_domctl_hvmcontext        hvmcontext;
         struct xen_domctl_hvmcontext_partial hvmcontext_partial;



Attachment: tscmode-draft.patch
Description: Binary data

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel