[Xen-changelog] First cut of new time interfaces and synchronisation mechanisms.

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] First cut of new time interfaces and synchronisation mechanisms.
From: Xen patchbot -unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Mon, 18 Jul 2005 16:22:10 -0400
Delivery-date: Mon, 18 Jul 2005 20:22:48 +0000
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 43564304cf9448ad8978df6d2d0d6721b4615143
# Parent  9697bc63d4039196b15378f3b3fe406c6a445ea2

First cut of new time interfaces and synchronisation mechanisms.
Based on an initial patch from Don Fry at IBM.
Still TODO: 
 1. Testing
 2. NTP synchronisation
 3. Fix wallclock interface a bit
 4. Support for platform timers other than PIT (e.g., HPET, IBM Cyclone)
 5. Scale 64-bit TSC diffs instead of 32-bit, just for sanity
 6. Error-correcting scale factor is still slightly wrong
 7. More testing
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
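
For context (an illustration, not part of the changeset): the version-stamped
vcpu_time_info record introduced below is read by guests with a retry loop,
much like a seqlock. A minimal sketch of the consistent-read protocol, using
a hypothetical read_time_info() helper and assuming the u32/u64 typedefs and
the rmb() barrier used in the Xen headers:

    /* Copy a consistent snapshot of 'src' into 'dst'. */
    static void read_time_info(vcpu_time_info_t *dst,
                               const volatile vcpu_time_info_t *src)
    {
        do {
            /* The writer bumps time_version1 just before an update and
             * time_version2 just after, so a stable record always has
             * version1 == version2. */
            dst->time_version2     = src->time_version2;
            rmb();
            dst->tsc_timestamp     = src->tsc_timestamp;
            dst->system_time       = src->system_time;
            dst->tsc_to_system_mul = src->tsc_to_system_mul;
            dst->tsc_shift         = src->tsc_shift;
            rmb();
        } while (dst->time_version2 != src->time_version1);
    }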

diff -r 9697bc63d403 -r 43564304cf94 xen/arch/x86/apic.c
--- a/xen/arch/x86/apic.c       Sun Jul 17 14:16:21 2005
+++ b/xen/arch/x86/apic.c       Mon Jul 18 20:22:11 2005
@@ -723,16 +723,8 @@
 static void __init setup_APIC_timer(unsigned int clocks)
 {
     unsigned long flags;
-    
     local_irq_save(flags);
-
-    /*
-     * Wait for IRQ0's slice:
-     */
-    wait_timer_tick();
-
     __setup_APIC_LVTT(clocks);
-
     local_irq_restore(flags);
 }
 
diff -r 9697bc63d403 -r 43564304cf94 linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile        Sun Jul 17 14:16:21 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile        Mon Jul 18 20:22:11 2005
@@ -19,7 +19,7 @@
 s-obj-y        :=
 
 obj-y                          += cpu/
-obj-y                          += timers/
+#obj-y                         += timers/
 obj-$(CONFIG_ACPI_BOOT)                += acpi/
 #c-obj-$(CONFIG_X86_BIOS_REBOOT)       += reboot.o
 c-obj-$(CONFIG_MCA)            += mca.o
diff -r 9697bc63d403 -r 43564304cf94 xen/common/domain.c
--- a/xen/common/domain.c       Sun Jul 17 14:16:21 2005
+++ b/xen/common/domain.c       Mon Jul 18 20:22:11 2005
@@ -42,8 +42,6 @@
     d->domain_id   = dom_id;
     v->processor  = cpu;
  
-    spin_lock_init(&d->time_lock);
-
     spin_lock_init(&d->big_lock);
 
     spin_lock_init(&d->page_alloc_lock);
diff -r 9697bc63d403 -r 43564304cf94 xen/arch/x86/vmx_intercept.c
--- a/xen/arch/x86/vmx_intercept.c      Sun Jul 17 14:16:21 2005
+++ b/xen/arch/x86/vmx_intercept.c      Mon Jul 18 20:22:11 2005
@@ -24,10 +24,10 @@
 #include <asm/vmx_virpit.h>
 #include <asm/vmx_intercept.h>
 #include <public/io/ioreq.h>
-
 #include <xen/lib.h>
 #include <xen/sched.h>
 #include <asm/current.h>
+#include <io_ports.h>
 
 #ifdef CONFIG_VMX
 
@@ -175,7 +175,7 @@
         p->port_mm)
         return 0;
     
-    if (p->addr == 0x43 &&
+    if (p->addr == PIT_MODE &&
        p->dir == 0 &&                          /* write */
         ((p->u.data >> 4) & 0x3) == 0 &&       /* latch command */
         ((p->u.data >> 6) & 0x3) == (vpit->channel)) {/* right channel */
@@ -183,7 +183,7 @@
        return 1;
     }
 
-    if (p->addr == (0x40 + vpit->channel) &&
+    if (p->addr == (PIT_CH0 + vpit->channel) &&
        p->dir == 1) {  /* read */
         p->u.data = pit_read_io(vpit);
         resume_pit_io(p);
diff -r 9697bc63d403 -r 43564304cf94 xen/arch/x86/i8259.c
--- a/xen/arch/x86/i8259.c      Sun Jul 17 14:16:21 2005
+++ b/xen/arch/x86/i8259.c      Mon Jul 18 20:22:11 2005
@@ -19,7 +19,7 @@
 #include <asm/bitops.h>
 #include <xen/delay.h>
 #include <asm/apic.h>
-
+#include <io_ports.h>
 
 /*
  * Common place to define all x86 IRQ vectors
@@ -395,9 +395,9 @@
     /* Set the clock to HZ Hz */
 #define CLOCK_TICK_RATE 1193180 /* crystal freq (Hz) */
 #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
-    outb_p(0x34,0x43);           /* binary, mode 2, LSB/MSB, ch 0 */
-    outb_p(LATCH & 0xff , 0x40); /* LSB */
-    outb(LATCH >> 8 , 0x40);     /* MSB */
+    outb_p(0x34, PIT_MODE);        /* binary, mode 2, LSB/MSB, ch 0 */
+    outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
+    outb(LATCH >> 8, PIT_CH0);     /* MSB */
 
     setup_irq(2, &cascade);
 }
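
An aside on the LATCH value programmed above (illustrative, not part of the
patch): the i8254 PIT input clock is CLOCK_TICK_RATE = 1193180 Hz, and the
+HZ/2 term rounds the reload count to the nearest integer. For example,
assuming HZ = 100:

    #define CLOCK_TICK_RATE 1193180                    /* PIT crystal (Hz)  */
    #define HZ              100                        /* assumed tick rate */
    #define LATCH (((CLOCK_TICK_RATE) + (HZ/2)) / HZ)  /* = 11932           */

    /* Effective tick rate: 1193180 / 11932 ~= 99.9983 Hz, so the rounded
     * reload value is accurate to roughly 17 parts per million.           */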
diff -r 9697bc63d403 -r 43564304cf94 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Sun Jul 17 14:16:21 2005
+++ b/xen/common/page_alloc.c   Mon Jul 18 20:22:11 2005
@@ -351,10 +351,10 @@
 void scrub_heap_pages(void)
 {
     void *p;
-    unsigned long pfn, flags;
+    unsigned long pfn;
+    int cpu = smp_processor_id();
 
     printk("Scrubbing Free RAM: ");
-    watchdog_disable();
 
     for ( pfn = 0; pfn < (bitmap_size * 8); pfn++ )
     {
@@ -362,12 +362,15 @@
         if ( (pfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
             printk(".");
 
+        if ( unlikely(softirq_pending(cpu)) )
+            do_softirq();
+
         /* Quick lock-free check. */
         if ( allocated_in_map(pfn) )
             continue;
-        
-        spin_lock_irqsave(&heap_lock, flags);
-        
+
+        spin_lock_irq(&heap_lock);
+
         /* Re-check page status with lock held. */
         if ( !allocated_in_map(pfn) )
         {
@@ -385,11 +388,10 @@
                 unmap_domain_page(p);
             }
         }
-        
-        spin_unlock_irqrestore(&heap_lock, flags);
-    }
-
-    watchdog_enable();
+
+        spin_unlock_irq(&heap_lock);
+    }
+
     printk("done.\n");
 }
 
diff -r 9697bc63d403 -r 43564304cf94 xen/common/ac_timer.c
--- a/xen/common/ac_timer.c     Sun Jul 17 14:16:21 2005
+++ b/xen/common/ac_timer.c     Mon Jul 18 20:22:11 2005
@@ -202,7 +202,7 @@
     do {
         heap = ac_timers[cpu].heap;
         now  = NOW();
-        
+
         while ( (GET_HEAP_SIZE(heap) != 0) &&
                 ((t = heap[1])->expires < (now + TIMER_SLOP)) )
         {
diff -r 9697bc63d403 -r 43564304cf94 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Sun Jul 17 14:16:21 2005
+++ b/xen/arch/x86/smpboot.c    Mon Jul 18 20:22:11 2005
@@ -40,6 +40,7 @@
 #include <xen/sched.h>
 #include <xen/irq.h>
 #include <xen/delay.h>
+#include <xen/softirq.h>
 #include <asm/current.h>
 #include <asm/mc146818rtc.h>
 #include <asm/desc.h>
@@ -406,6 +407,7 @@
         */
        if (cpu_has_tsc && cpu_khz)
                synchronize_tsc_ap();
+       calibrate_tsc_ap();
 }
 
 int cpucount;
@@ -464,6 +466,8 @@
 
        /* We can take interrupts now: we're officially "up". */
        local_irq_enable();
+
+        init_percpu_time();
 
        wmb();
        startup_cpu_idle_loop();
@@ -1149,6 +1153,7 @@
         */
        if (cpu_has_tsc && cpucount && cpu_khz)
                synchronize_tsc_bp();
+       calibrate_tsc_bp();
 }
 
 /* These are wrappers to interface to the new boot process.  Someone
@@ -1167,22 +1172,21 @@
 int __devinit __cpu_up(unsigned int cpu)
 {
        /* This only works at boot for x86.  See "rewrite" above. */
-       if (cpu_isset(cpu, smp_commenced_mask)) {
-               local_irq_enable();
+       if (cpu_isset(cpu, smp_commenced_mask))
                return -ENOSYS;
-       }
 
        /* In case one didn't come up */
-       if (!cpu_isset(cpu, cpu_callin_map)) {
-               local_irq_enable();
+       if (!cpu_isset(cpu, cpu_callin_map))
                return -EIO;
-       }
-
-       local_irq_enable();
+
        /* Unleash the CPU! */
        cpu_set(cpu, smp_commenced_mask);
-       while (!cpu_isset(cpu, cpu_online_map))
+       while (!cpu_isset(cpu, cpu_online_map)) {
                mb();
+               if (softirq_pending(0))
+                       do_softirq();
+       }
+
        return 0;
 }
 
diff -r 9697bc63d403 -r 43564304cf94 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Sun Jul 17 14:16:21 2005
+++ b/xen/include/xen/sched.h   Mon Jul 18 20:22:11 2005
@@ -92,7 +92,6 @@
     domid_t          domain_id;
 
     shared_info_t   *shared_info;     /* shared data area */
-    spinlock_t       time_lock;
 
     spinlock_t       big_lock;
 
diff -r 9697bc63d403 -r 43564304cf94 xen/drivers/char/console.c
--- a/xen/drivers/char/console.c        Sun Jul 17 14:16:21 2005
+++ b/xen/drivers/char/console.c        Mon Jul 18 20:22:11 2005
@@ -635,8 +635,6 @@
 
     debugtrace_bytes = bytes;
 
-    memset(debugtrace_buf, '\0', debugtrace_bytes);
-
     return 0;
 }
 __initcall(debugtrace_init);
diff -r 9697bc63d403 -r 43564304cf94 xen/include/xen/time.h
--- a/xen/include/xen/time.h    Sun Jul 17 14:16:21 2005
+++ b/xen/include/xen/time.h    Mon Jul 18 20:22:11 2005
@@ -30,7 +30,8 @@
 #include <public/xen.h>
 #include <asm/time.h>
 
-extern int init_xen_time();
+extern int init_xen_time(void);
+extern void init_percpu_time(void);
 
 extern unsigned long cpu_khz;
 
diff -r 9697bc63d403 -r 43564304cf94 xen/include/public/xen.h
--- a/xen/include/public/xen.h  Sun Jul 17 14:16:21 2005
+++ b/xen/include/public/xen.h  Mon Jul 18 20:22:11 2005
@@ -329,12 +329,36 @@
 #endif
 } vcpu_info_t;
 
+typedef struct vcpu_time_info {
+    /*
+     * The following values are updated periodically (and not necessarily
+     * atomically!). The guest OS detects this because 'time_version1' is
+     * incremented just before updating these values, and 'time_version2' is
+     * incremented immediately after. See the Xen-specific Linux code for an
+     * example of how to read these values safely (arch/xen/kernel/time.c).
+     */
+    u32 time_version1;
+    u32 time_version2;
+    u64 tsc_timestamp;   /* TSC at last update of time vals.  */
+    u64 system_time;     /* Time, in nanosecs, since boot.    */
+    /*
+     * Current system time:
+     *   system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
+     * CPU frequency (Hz):
+     *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
+     */
+    u32 tsc_to_system_mul;
+    s8  tsc_shift;
+} vcpu_time_info_t;
+
 /*
  * Xen/kernel shared data -- pointer provided in start_info.
  * NB. We expect that this struct is smaller than a page.
  */
 typedef struct shared_info {
     vcpu_info_t vcpu_data[MAX_VIRT_CPUS];
+
+    vcpu_time_info_t vcpu_time[MAX_VIRT_CPUS];
 
     u32 n_vcpu;
 
@@ -373,33 +397,11 @@
     u32 evtchn_mask[32];
 
     /*
-     * Time: The following abstractions are exposed: System Time, Clock Time,
-     * Domain Virtual Time. Domains can access Cycle counter time directly.
+     * Wallclock time: updated only by control software. Guests should base
+     * their gettimeofday() syscall on this wallclock-base value.
      */
-    u64                cpu_freq;        /* CPU frequency (Hz).          */
-
-    /*
-     * The following values are updated periodically (and not necessarily
-     * atomically!). The guest OS detects this because 'time_version1' is
-     * incremented just before updating these values, and 'time_version2' is
-     * incremented immediately after. See the Xen-specific Linux code for an
-     * example of how to read these values safely (arch/xen/kernel/time.c).
-     */
-    u32                time_version1;
-    u32                time_version2;
-    tsc_timestamp_t    tsc_timestamp;   /* TSC at last update of time vals.  */
-    u64                system_time;     /* Time, in nanosecs, since boot.    */
     u32                wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
     u32                wc_usec;         /* Usecs 00:00:00 UTC, Jan 1, 1970.  */
-    u64                domain_time;     /* Domain virtual time, in nanosecs. */
-
-    /*
-     * Timeout values:
-     * Allow a domain to specify a timeout value in system time and 
-     * domain virtual time.
-     */
-    u64                wall_timeout;
-    u64                domain_timeout;
 
     arch_shared_info_t arch;
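
To make the two formulas above concrete (an illustration, not part of the
patch): tsc_to_system_mul is a 32.32 fixed-point multiplier giving
nanoseconds per (shifted) TSC tick, and a negative tsc_shift means a right
shift. Assuming a 2 GHz TSC, set_time_scale() in xen/arch/x86/time.c below
produces tsc_shift = 0 and tsc_to_system_mul = 0x80000000, i.e. 0.5 ns per
cycle:

    /* Hypothetical snapshot for a 2 GHz TSC. */
    u64 tsc_timestamp     = 0;
    u64 system_time       = 0;           /* ns of system time at stamp  */
    u32 tsc_to_system_mul = 0x80000000;  /* 0.5 in 32.32 fixed point    */
    s8  tsc_shift         = 0;

    u64 tsc   = 2000000000ULL;           /* one second of cycles later  */
    u32 delta = (u32)((tsc - tsc_timestamp) << tsc_shift);
    u64 now   = system_time + (((u64)delta * tsc_to_system_mul) >> 32);
    /* now == 1000000000: one second of system time, as expected.       */

    /* Recovered CPU frequency: ((10^9 << 32) / 0x80000000) >> 0 == 2 GHz. */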
 
diff -r 9697bc63d403 -r 43564304cf94 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile      Sun Jul 17 14:16:21 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile      Mon Jul 18 20:22:11 2005
@@ -15,7 +15,7 @@
                ptrace.o quirks.o syscall.o bootflag.o
 
 i386-obj-y                     := time.o
-obj-y                          += ../../i386/kernel/timers/
+#obj-y                         += ../../i386/kernel/timers/
 
 s-obj-y        :=
 
diff -r 9697bc63d403 -r 43564304cf94 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Sun Jul 17 14:16:21 2005
+++ b/xen/arch/x86/time.c       Mon Jul 18 20:22:11 2005
@@ -1,16 +1,12 @@
-/****************************************************************************
- * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
- * (C) 2002-2003 University of Cambridge
- ****************************************************************************
- *
- *        File: i386/time.c
- *      Author: Rolf Neugebar & Keir Fraser
- */
-
-/*
- *  linux/arch/i386/kernel/time.c
- *
- *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
+/******************************************************************************
+ * arch/x86/time.c
+ * 
+ * Per-CPU time calibration and management.
+ * 
+ * Copyright (c) 2002-2005, K A Fraser
+ * 
+ * Portions from Linux are:
+ * Copyright (c) 1991, 1992, 1995  Linus Torvalds
  */
 
 #include <xen/config.h>
@@ -31,29 +27,74 @@
 #include <asm/processor.h>
 #include <asm/fixmap.h>
 #include <asm/mc146818rtc.h>
-
-/* GLOBAL */
+#include <asm/div64.h>
+#include <io_ports.h>
+
 unsigned long cpu_khz;  /* CPU clock frequency in kHz. */
 spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
 int timer_ack = 0;
 unsigned long volatile jiffies;
-
-/* PRIVATE */
-static unsigned int    rdtsc_bitshift;  /* Which 32 bits of TSC do we use?   */
-static u64             cpu_freq;        /* CPU frequency (Hz)                */
-static u32             st_scale_f;      /* Cycles -> ns, fractional part     */
-static u32             st_scale_i;      /* Cycles -> ns, integer part        */
-static u32             shifted_tsc_irq; /* CPU0's TSC at last 'time update'  */
-static u64             full_tsc_irq;    /* ...ditto, but all 64 bits         */
-static s_time_t        stime_irq;       /* System time at last 'time update' */
-static unsigned long   wc_sec, wc_usec; /* UTC time at last 'time update'.   */
-static rwlock_t        time_lock = RW_LOCK_UNLOCKED;
+static unsigned long wc_sec, wc_usec; /* UTC time at last 'time update'. */
+
+struct time_scale {
+    int shift;
+    u32 mul_frac;
+};
+
+struct cpu_time {
+    u64 local_tsc_stamp;
+    s_time_t stime_local_stamp;
+    s_time_t stime_master_stamp;
+    struct time_scale tsc_scale;
+    struct ac_timer calibration_timer;
+} __cacheline_aligned;
+
+static struct cpu_time cpu_time[NR_CPUS];
+
+/* Protected by platform_timer_lock. */
+static s_time_t stime_platform_stamp;
+static u64 platform_timer_stamp;
+static struct time_scale platform_timer_scale;
+static spinlock_t platform_timer_lock = SPIN_LOCK_UNLOCKED;
+
+static inline u32 down_shift(u64 time, int shift)
+{
+    if ( shift < 0 )
+        return (u32)(time >> -shift);
+    return (u32)((u32)time << shift);
+}
+
+/*
+ * 32-bit division of integer dividend and integer divisor yielding
+ * 32-bit fractional quotient.
+ */
+static inline u32 div_frac(u32 dividend, u32 divisor)
+{
+    u32 quotient, remainder;
+    ASSERT(dividend < divisor);
+    __asm__ ( 
+        "div %4"
+        : "=a" (quotient), "=d" (remainder)
+        : "0" (0), "1" (dividend), "r" (divisor) );
+    return quotient;
+}
+
+/*
+ * 32-bit multiplication of integer multiplicand and fractional multiplier
+ * yielding 32-bit integer product.
+ */
+static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
+{
+    u32 product_int, product_frac;
+    __asm__ (
+        "mul %3"
+        : "=a" (product_frac), "=d" (product_int)
+        : "0" (multiplicand), "r" (multiplier) );
+    return product_int;
+}
 
 void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
 {
-    write_lock_irq(&time_lock);
-
-#ifdef CONFIG_X86_IO_APIC
     if ( timer_ack ) 
     {
         extern spinlock_t i8259A_lock;
@@ -63,30 +104,9 @@
         inb(0x20);
         spin_unlock(&i8259A_lock);
     }
-#endif
     
-    /*
-     * Updates TSC timestamp (used to interpolate passage of time between
-     * interrupts).
-     */
-    rdtscll(full_tsc_irq);
-    shifted_tsc_irq = (u32)(full_tsc_irq >> rdtsc_bitshift);
-
     /* Update jiffies counter. */
     (*(unsigned long *)&jiffies)++;
-
-    /* Update wall time. */
-    wc_usec += 1000000/HZ;
-    if ( wc_usec >= 1000000 )
-    {
-        wc_usec -= 1000000;
-        wc_sec++;
-    }
-
-    /* Updates system time (nanoseconds since boot). */
-    stime_irq += MILLISECS(1000/HZ);
-
-    write_unlock_irq(&time_lock);
 
     /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
     if ( !cpu_has_apic )
@@ -103,9 +123,9 @@
 #define CALIBRATE_FRAC  20      /* calibrate over 50ms */
 #define CALIBRATE_LATCH ((CLOCK_TICK_RATE+(CALIBRATE_FRAC/2))/CALIBRATE_FRAC)
 
-static unsigned long __init calibrate_tsc(void)
-{
-    u64 start, end, diff;
+static u64 calibrate_boot_tsc(void)
+{
+    u64 start, end;
     unsigned long count;
 
     /* Set the Gate high, disable speaker */
@@ -118,9 +138,9 @@
      * terminal count mode), binary count, load 5 * LATCH count, (LSB and MSB)
      * to begin countdown.
      */
-    outb(0xb0, 0x43);           /* binary, mode 0, LSB/MSB, Ch 2 */
-    outb(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
-    outb(CALIBRATE_LATCH >> 8, 0x42);   /* MSB of count */
+    outb(0xb0, PIT_MODE);           /* binary, mode 0, LSB/MSB, Ch 2 */
+    outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
+    outb(CALIBRATE_LATCH >> 8, PIT_CH2);   /* MSB of count */
 
     rdtscll(start);
     for ( count = 0; (inb(0x61) & 0x20) == 0; count++ )
@@ -131,15 +151,147 @@
     if ( count == 0 )
         return 0;
 
-    diff = end - start;
-
-#if defined(__i386__)
-    /* If quotient doesn't fit in 32 bits then we return error (zero). */
-    if ( diff & ~0xffffffffULL )
-        return 0;
-#endif
-
-    return (unsigned long)diff;
+    return ((end - start) * (u64)CALIBRATE_FRAC);
+}
+
+static void set_time_scale(struct time_scale *ts, u64 ticks_per_sec)
+{
+    u64 tps64 = ticks_per_sec;
+    u32 tps32;
+    int shift = 0;
+
+    while ( tps64 > (MILLISECS(1000)*2) )
+    {
+        tps64 >>= 1;
+        shift--;
+    }
+
+    tps32 = (u32)tps64;
+    while ( tps32 < (u32)MILLISECS(1000) )
+    {
+        tps32 <<= 1;
+        shift++;
+    }
+
+    ts->mul_frac = div_frac(MILLISECS(1000), tps32);
+    ts->shift    = shift;
+}
+
+static atomic_t tsc_calibrate_gang = ATOMIC_INIT(0);
+static unsigned int tsc_calibrate_status = 0;
+
+void calibrate_tsc_bp(void)
+{
+    while ( atomic_read(&tsc_calibrate_gang) != (num_booting_cpus() - 1) )
+        mb();
+
+    outb(CALIBRATE_LATCH & 0xff, PIT_CH2);
+    outb(CALIBRATE_LATCH >> 8, PIT_CH2);
+
+    tsc_calibrate_status = 1;
+       wmb();
+
+    while ( (inb(0x61) & 0x20) == 0 )
+        continue;
+
+    tsc_calibrate_status = 2;
+       wmb();
+
+    while ( atomic_read(&tsc_calibrate_gang) != 0 )
+        mb();
+}
+
+void calibrate_tsc_ap(void)
+{
+    u64 t1, t2, ticks_per_sec;
+
+    atomic_inc(&tsc_calibrate_gang);
+
+    while ( tsc_calibrate_status < 1 )
+        mb();
+
+    rdtscll(t1);
+
+    while ( tsc_calibrate_status < 2 )
+        mb();
+
+    rdtscll(t2);
+
+    ticks_per_sec = (t2 - t1) * (u64)CALIBRATE_FRAC;
+    set_time_scale(&cpu_time[smp_processor_id()].tsc_scale, ticks_per_sec);
+
+    atomic_dec(&tsc_calibrate_gang);
+}
+
+/* Protected by platform_timer_lock. */
+static u64 platform_pit_counter;
+static u16 pit_stamp;
+static struct ac_timer pit_overflow_timer;
+
+static u16 pit_read_counter(void)
+{
+    u16 count;
+    ASSERT(spin_is_locked(&platform_timer_lock));
+    outb(0x80, PIT_MODE);
+    count  = inb(PIT_CH2);
+    count |= inb(PIT_CH2) << 8;
+    return count;
+}
+
+static void pit_overflow(void *unused)
+{
+    u16 counter;
+
+    spin_lock(&platform_timer_lock);
+    counter = pit_read_counter();
+    platform_pit_counter += (u16)(pit_stamp - counter);
+    pit_stamp = counter;
+    spin_unlock(&platform_timer_lock);
+
+    set_ac_timer(&pit_overflow_timer, NOW() + MILLISECS(20));
+}
+
+static void init_platform_timer(void)
+{
+    init_ac_timer(&pit_overflow_timer, pit_overflow, NULL, 0);
+    pit_overflow(NULL);
+    platform_timer_stamp = platform_pit_counter;
+    set_time_scale(&platform_timer_scale, CLOCK_TICK_RATE);
+}
+
+static s_time_t __read_platform_stime(u64 platform_time)
+{
+    u64 diff64 = platform_time - platform_timer_stamp;
+    u32 diff   = down_shift(diff64, platform_timer_scale.shift);
+    ASSERT(spin_is_locked(&platform_timer_lock));
+    return (stime_platform_stamp + 
+            (u64)mul_frac(diff, platform_timer_scale.mul_frac));
+}
+
+static s_time_t read_platform_stime(void)
+{
+    u64 counter;
+    s_time_t stime;
+
+    spin_lock(&platform_timer_lock);
+    counter = platform_pit_counter + (u16)(pit_stamp - pit_read_counter());
+    stime   = __read_platform_stime(counter);
+    spin_unlock(&platform_timer_lock);
+
+    return stime;
+}
+
+static void platform_time_calibration(void)
+{
+    u64 counter;
+    s_time_t stamp;
+
+    spin_lock(&platform_timer_lock);
+    counter = platform_pit_counter + (u16)(pit_stamp - pit_read_counter());
+    stamp   = __read_platform_stime(counter);
+    stime_platform_stamp = stamp;
+    platform_timer_stamp = counter;
+    spin_unlock(&platform_timer_lock);
 }
 
 
@@ -233,140 +385,214 @@
  * System Time
  ***************************************************************************/
 
-static inline u64 get_time_delta(void)
-{
-    s32      delta_tsc;
-    u32      low;
-    u64      delta, tsc;
-
-    ASSERT(st_scale_f || st_scale_i);
+s_time_t get_s_time(void)
+{
+    struct cpu_time *t = &cpu_time[smp_processor_id()];
+    u64 tsc;
+    u32 delta;
+    s_time_t now;
 
     rdtscll(tsc);
-    low = (u32)(tsc >> rdtsc_bitshift);
-    delta_tsc = (s32)(low - shifted_tsc_irq);
-    if ( unlikely(delta_tsc < 0) ) delta_tsc = 0;
-    delta = ((u64)delta_tsc * st_scale_f);
-    delta >>= 32;
-    delta += ((u64)delta_tsc * st_scale_i);
-
-    return delta;
-}
-
-s_time_t get_s_time(void)
-{
-    s_time_t now;
-    unsigned long flags;
-
-    read_lock_irqsave(&time_lock, flags);
-
-    now = stime_irq + get_time_delta();
-
-    /* Ensure that the returned system time is monotonically increasing. */
-    {
-        static s_time_t prev_now = 0;
-        if ( unlikely(now < prev_now) )
-            now = prev_now;
-        prev_now = now;
-    }
-
-    read_unlock_irqrestore(&time_lock, flags);
-
-    return now; 
+    delta = down_shift(tsc - t->local_tsc_stamp, t->tsc_scale.shift);
+    now = t->stime_local_stamp + (u64)mul_frac(delta, t->tsc_scale.mul_frac);
+
+    return now;
 }
 
 static inline void __update_dom_time(struct vcpu *v)
 {
-    struct domain *d  = v->domain;
-    shared_info_t *si = d->shared_info;
-
-    spin_lock(&d->time_lock);
-
-    si->time_version1++;
+    struct cpu_time       *t = &cpu_time[smp_processor_id()];
+    struct vcpu_time_info *u = &v->domain->shared_info->vcpu_time[v->vcpu_id];
+
+    u->time_version1++;
     wmb();
 
-    si->cpu_freq       = cpu_freq;
-    si->tsc_timestamp  = full_tsc_irq;
-    si->system_time    = stime_irq;
-    si->wc_sec         = wc_sec;
-    si->wc_usec        = wc_usec;
+    u->tsc_timestamp     = t->local_tsc_stamp;
+    u->system_time       = t->stime_local_stamp;
+    u->tsc_to_system_mul = t->tsc_scale.mul_frac;
+    u->tsc_shift         = (s8)t->tsc_scale.shift;
 
     wmb();
-    si->time_version2++;
-
-    spin_unlock(&d->time_lock);
+    u->time_version2++;
+
+    /* Should only do this during do_settime(). */
+    v->domain->shared_info->wc_sec  = wc_sec;
+    v->domain->shared_info->wc_usec = wc_usec;
 }
 
 void update_dom_time(struct vcpu *v)
 {
-    unsigned long flags;
-
-    if ( v->domain->shared_info->tsc_timestamp != full_tsc_irq )
-    {
-        read_lock_irqsave(&time_lock, flags);
+    if ( v->domain->shared_info->vcpu_time[v->vcpu_id].tsc_timestamp != 
+         cpu_time[smp_processor_id()].local_tsc_stamp )
         __update_dom_time(v);
-        read_unlock_irqrestore(&time_lock, flags);
-    }
 }
 
 /* Set clock to <secs,usecs> after 00:00:00 UTC, 1 January, 1970. */
 void do_settime(unsigned long secs, unsigned long usecs, u64 system_time_base)
 {
-    s64 delta;
-    long _usecs = (long)usecs;
-
-    write_lock_irq(&time_lock);
-
-    delta = (s64)(stime_irq - system_time_base);
-
-    _usecs += (long)(delta/1000);
-    while ( _usecs >= 1000000 ) 
-    {
-        _usecs -= 1000000;
-        secs++;
-    }
-
-    wc_sec  = secs;
-    wc_usec = _usecs;
-
-    /* Others will pick up the change at the next tick. */
+    u64 x, base_usecs;
+    u32 y;
+
+    base_usecs = system_time_base;
+    do_div(base_usecs, 1000);
+
+    x = (secs * 1000000ULL) + (u64)usecs + base_usecs;
+    y = do_div(x, 1000000);
+
+    wc_sec  = (unsigned long)x;
+    wc_usec = (unsigned long)y;
+
     __update_dom_time(current);
-    send_guest_virq(current, VIRQ_TIMER);
-
-    write_unlock_irq(&time_lock);
-}
-
+}
+
+static void local_time_calibration(void *unused)
+{
+    unsigned int cpu = smp_processor_id();
+
+    /*
+     * System timestamps, extrapolated from local and master oscillators,
+     * taken during this calibration and the previous calibration.
+     */
+    s_time_t prev_local_stime, curr_local_stime;
+    s_time_t prev_master_stime, curr_master_stime;
+
+    /* TSC timestamps taken during this calibration and prev calibration. */
+    u64 prev_tsc, curr_tsc;
+
+    /*
+     * System time and TSC ticks elapsed during the previous calibration
+     * 'epoch'. Also the accumulated error in the local estimate. All these
+     * values end up down-shifted to fit in 32 bits.
+     */
+    u64 stime_elapsed64, tsc_elapsed64, local_stime_error64;
+    u32 stime_elapsed32, tsc_elapsed32, local_stime_error32;
+
+    /* Calculated TSC shift to ensure 32-bit scale multiplier. */
+    int tsc_shift = 0;
+
+    prev_tsc          = cpu_time[cpu].local_tsc_stamp;
+    prev_local_stime  = cpu_time[cpu].stime_local_stamp;
+    prev_master_stime = cpu_time[cpu].stime_master_stamp;
+
+    /* Disable IRQs to get 'instantaneous' current timestamps. */
+    local_irq_disable();
+    rdtscll(curr_tsc);
+    curr_local_stime  = get_s_time();
+    curr_master_stime = read_platform_stime();
+    local_irq_enable();
+
+#if 0
+    printk("PRE%d: tsc=%lld stime=%lld master=%lld\n",
+           cpu, prev_tsc, prev_local_stime, prev_master_stime);
+    printk("CUR%d: tsc=%lld stime=%lld master=%lld %lld\n",
+           cpu, curr_tsc, curr_local_stime, curr_master_stime,
+           platform_pit_counter);
+#endif
+
+    /* Local time warps forward if it lags behind master time. */
+    if ( curr_local_stime < curr_master_stime )
+        curr_local_stime = curr_master_stime;
+
+    stime_elapsed64 = curr_master_stime - prev_master_stime;
+    tsc_elapsed64   = curr_tsc - prev_tsc;
+
+    /*
+     * Error in the local system time estimate. Clamp to epoch time period, or
+     * we could end up with a negative scale factor (time going backwards!).
+     * This effectively clamps the scale factor to >= 0.
+     */
+    local_stime_error64 = curr_local_stime - curr_master_stime;
+    if ( local_stime_error64 > stime_elapsed64 )
+        local_stime_error64 = stime_elapsed64;
+
+    /*
+     * We require 0 < stime_elapsed < 2^31.
+     * This allows us to binary shift a 32-bit tsc_elapsed such that:
+     * stime_elapsed < tsc_elapsed <= 2*stime_elapsed
+     */
+    while ( ((u32)stime_elapsed64 != stime_elapsed64) ||
+            ((s32)stime_elapsed64 < 0) )
+    {
+        stime_elapsed64     >>= 1;
+        tsc_elapsed64       >>= 1;
+        local_stime_error64 >>= 1;
+    }
+
+    /* stime_master_diff (and hence stime_error) now fit in a 32-bit word. */
+    stime_elapsed32     = (u32)stime_elapsed64;
+    local_stime_error32 = (u32)local_stime_error64;
+
+    /* tsc_elapsed <= 2*stime_elapsed */
+    while ( tsc_elapsed64 > (stime_elapsed32 * 2) )
+    {
+        tsc_elapsed64 >>= 1;
+        tsc_shift--;
+    }
+
+    /* Local difference must now fit in 32 bits. */
+    ASSERT((u32)tsc_elapsed64 == tsc_elapsed64);
+    tsc_elapsed32 = (u32)tsc_elapsed64;
+
+    /* tsc_elapsed > stime_elapsed */
+    ASSERT(tsc_elapsed32 != 0);
+    while ( tsc_elapsed32 <= stime_elapsed32 )
+    {
+        tsc_elapsed32 <<= 1;
+        tsc_shift++;
+    }
+
+#if 0
+    printk("---%d: %08x %d\n", cpu, 
+           div_frac(stime_elapsed32 - local_stime_error32, tsc_elapsed32),
+           tsc_shift);
+#endif
+
+    /* Record new timestamp information. */
+    cpu_time[cpu].tsc_scale.mul_frac = 
+        div_frac(stime_elapsed32 - local_stime_error32, tsc_elapsed32);
+    cpu_time[cpu].tsc_scale.shift    = tsc_shift;
+    cpu_time[cpu].local_tsc_stamp    = curr_tsc;
+    cpu_time[cpu].stime_local_stamp  = curr_local_stime;
+    cpu_time[cpu].stime_master_stamp = curr_master_stime;
+
+    set_ac_timer(&cpu_time[cpu].calibration_timer, NOW() + MILLISECS(1000));
+
+    if ( cpu == 0 )
+        platform_time_calibration();
+}
+
+void init_percpu_time(void)
+{
+    unsigned int cpu = smp_processor_id();
+    unsigned long flags;
+    s_time_t now;
+
+    local_irq_save(flags);
+    rdtscll(cpu_time[cpu].local_tsc_stamp);
+    now = (cpu == 0) ? 0 : read_platform_stime();
+    local_irq_restore(flags);
+
+    cpu_time[cpu].stime_master_stamp = now;
+    cpu_time[cpu].stime_local_stamp  = now;
+
+    init_ac_timer(&cpu_time[cpu].calibration_timer,
+                  local_time_calibration, NULL, cpu);
+    set_ac_timer(&cpu_time[cpu].calibration_timer, NOW() + MILLISECS(1000));
+}
 
 /* Late init function (after all CPUs are booted). */
-int __init init_xen_time()
-{
-    u64      scale;
-    unsigned int cpu_ghz;
-
-    cpu_ghz = (unsigned int)(cpu_freq / 1000000000ULL);
-    for ( rdtsc_bitshift = 0; cpu_ghz != 0; rdtsc_bitshift++, cpu_ghz >>= 1 )
-        continue;
-
-    scale  = 1000000000LL << (32 + rdtsc_bitshift);
-    scale /= cpu_freq;
-    st_scale_f = scale & 0xffffffff;
-    st_scale_i = scale >> 32;
+int __init init_xen_time(void)
+{
+    wc_sec = get_cmos_time();
 
     local_irq_disable();
 
-    /* System time ticks from zero. */
-    rdtscll(full_tsc_irq);
-    stime_irq = (s_time_t)0;
-    shifted_tsc_irq = (u32)(full_tsc_irq >> rdtsc_bitshift);
-
-    /* Wallclock time starts as the initial RTC time. */
-    wc_sec = get_cmos_time();
+    init_percpu_time();
+
+    stime_platform_stamp = 0;
+    init_platform_timer();
 
     local_irq_enable();
-
-    printk("Time init:\n");
-    printk(".... cpu_freq:    %08X:%08X\n", (u32)(cpu_freq>>32),(u32)cpu_freq);
-    printk(".... scale:       %08X:%08X\n", (u32)(scale>>32),(u32)scale);
-    printk(".... Wall Clock:  %lds %ldus\n", wc_sec, wc_usec);
 
     return 0;
 }
@@ -375,15 +601,12 @@
 /* Early init function. */
 void __init early_time_init(void)
 {
-    unsigned long ticks_per_frac = calibrate_tsc();
-
-    if ( !ticks_per_frac )
-        panic("Error calibrating TSC\n");
-
-    cpu_khz = ticks_per_frac / (1000/CALIBRATE_FRAC);
-
-    cpu_freq = (u64)ticks_per_frac * (u64)CALIBRATE_FRAC;
-
+    u64 tmp = calibrate_boot_tsc();
+
+    set_time_scale(&cpu_time[0].tsc_scale, tmp);
+
+    do_div(tmp, 1000);
+    cpu_khz = (unsigned long)tmp;
     printk("Detected %lu.%03lu MHz processor.\n", 
            cpu_khz / 1000, cpu_khz % 1000);
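
One subtlety in the platform-timer code in this file (an illustrative note,
not part of the patch): the PIT counter is only 16 bits wide and counts
down, so at ~1.19 MHz it wraps about every 55 ms; pit_overflow() therefore
reruns every 20 ms and accumulates elapsed ticks with wrap-safe 16-bit
arithmetic. A minimal sketch of that accumulation, where read_hw_counter()
is a hypothetical stand-in for latching and reading the hardware count:

    static u64 platform_pit_counter;  /* free-running 64-bit accumulator */
    static u16 pit_stamp;             /* hardware count at last sample   */

    static void accumulate_pit_ticks(void)
    {
        u16 counter = read_hw_counter();  /* hypothetical helper */
        /* The PIT counts down, so elapsed = old - new; the u16 result
         * of the subtraction stays correct across one wraparound.    */
        platform_pit_counter += (u16)(pit_stamp - counter);
        pit_stamp = counter;
    }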
 
diff -r 9697bc63d403 -r 43564304cf94 xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h        Sun Jul 17 14:16:21 2005
+++ b/xen/include/asm-x86/time.h        Mon Jul 18 20:22:11 2005
@@ -4,4 +4,7 @@
 
 extern int timer_ack;
 
+extern void calibrate_tsc_bp(void);
+extern void calibrate_tsc_ap(void);
+
 #endif /* __X86_TIME_H__ */
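
A note on the div_frac()/mul_frac() helpers introduced in
xen/arch/x86/time.c above and mirrored in the guest's time.c below
(illustrative, not part of the patch): the inline assembly implements 32.32
fixed-point arithmetic with single x86 div/mul instructions. The renamed
portable-C equivalents, assuming 64-bit arithmetic is available:

    /* floor(2^32 * dividend / divisor); requires dividend < divisor. */
    static inline u32 div_frac_portable(u32 dividend, u32 divisor)
    {
        return (u32)(((u64)dividend << 32) / divisor);
    }

    /* floor((multiplicand * multiplier) / 2^32): multiply a 32-bit
     * integer by a 32-bit fraction, keeping the integer part.        */
    static inline u32 mul_frac_portable(u32 multiplicand, u32 multiplier)
    {
        return (u32)(((u64)multiplicand * multiplier) >> 32);
    }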
diff -r 9697bc63d403 -r 43564304cf94 linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c  Sun Jul 17 14:16:21 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c  Mon Jul 18 20:22:11 2005
@@ -104,24 +104,16 @@
 struct timer_opts *cur_timer = &timer_tsc;
 
 /* These are periodically updated in shared_info, and then copied here. */
-u32 shadow_tsc_stamp;
-u64 shadow_system_time;
-static u32 shadow_time_version;
+struct shadow_time_info {
+       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+       u32 tsc_to_nsec_mul;
+       u32 tsc_to_usec_mul;
+       int tsc_shift;
+       u32 version;
+};
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
 static struct timeval shadow_tv;
-
-/*
- * We use this to ensure that gettimeofday() is monotonically increasing. We
- * only break this guarantee if the wall clock jumps backwards "a long way".
- */
-static struct timeval last_seen_tv = {0,0};
-
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
-/* Periodically propagate synchronised time base to the RTC and to Xen. */
-static long last_rtc_update, last_update_to_xen;
-#endif
-
-/* Periodically take synchronised time base from Xen, if we need it. */
-static long last_update_from_xen;   /* UTC seconds when last read Xen clock. */
 
 /* Keep track of last time we did processing/updating of jiffies and xtime. */
 static u64 processed_system_time;   /* System time (ns) at last processing. */
@@ -164,26 +156,147 @@
 #define INDEPENDENT_WALLCLOCK() \
     (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN))
 
+int tsc_disable __initdata = 0;
+
+static void delay_tsc(unsigned long loops)
+{
+       unsigned long bclock, now;
+       
+       rdtscl(bclock);
+       do
+       {
+               rep_nop();
+               rdtscl(now);
+       } while ((now-bclock) < loops);
+}
+
+struct timer_opts timer_tsc = {
+       .name = "tsc",
+       .delay = delay_tsc,
+};
+
+static inline u32 down_shift(u64 time, int shift)
+{
+       if ( shift < 0 )
+               return (u32)(time >> -shift);
+       return (u32)((u32)time << shift);
+}
+
+/*
+ * 32-bit multiplication of integer multiplicand and fractional multiplier
+ * yielding 32-bit integer product.
+ */
+static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
+{
+       u32 product_int, product_frac;
+       __asm__ (
+               "mul %3"
+               : "=a" (product_frac), "=d" (product_int)
+               : "0" (multiplicand), "r" (multiplier) );
+       return product_int;
+}
+
+void init_cpu_khz(void)
+{
+       u64 __cpu_khz = 1000000ULL << 32;
+       struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_time[0];
+       do_div(__cpu_khz, info->tsc_to_system_mul);
+       cpu_khz = down_shift(__cpu_khz, -info->tsc_shift);
+       printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n",
+              cpu_khz / 1000, cpu_khz % 1000);
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+       u64 now;
+       u32 delta;
+       rdtscll(now);
+       delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift);
+       return mul_frac(delta, shadow->tsc_to_nsec_mul);
+}
+
+static unsigned long get_usec_offset(struct shadow_time_info *shadow)
+{
+       u64 now;
+       u32 delta;
+       rdtscll(now);
+       delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift);
+       return mul_frac(delta, shadow->tsc_to_usec_mul);
+}
+
+static void update_wallclock(void)
+{
+       shared_info_t *s = HYPERVISOR_shared_info;
+       long wtm_nsec;
+       time_t wtm_sec, sec;
+       s64 nsec;
+
+       shadow_tv.tv_sec  = s->wc_sec;
+       shadow_tv.tv_usec = s->wc_usec;
+
+       if (INDEPENDENT_WALLCLOCK())
+               return;
+
+       if ((time_status & STA_UNSYNC) != 0)
+               return;
+
+       /* Adjust shadow for jiffies that haven't updated xtime yet. */
+       shadow_tv.tv_usec -= 
+               (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ);
+       HANDLE_USEC_UNDERFLOW(shadow_tv);
+
+       /* Update our unsynchronised xtime appropriately. */
+       sec = shadow_tv.tv_sec;
+       nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
+
+       __normalize_time(&sec, &nsec);
+       wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+       wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+
+       set_normalized_timespec(&xtime, sec, nsec);
+       set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+}
+
 /*
  * Reads a consistent set of time-base values from Xen, into a shadow data
  * area. Must be called with the xtime_lock held for writing.
  */
 static void __get_time_values_from_xen(void)
 {
-       shared_info_t *s = HYPERVISOR_shared_info;
+       shared_info_t           *s = HYPERVISOR_shared_info;
+       struct vcpu_time_info   *src;
+       struct shadow_time_info *dst;
+
+       src = &s->vcpu_time[smp_processor_id()];
+       dst = &per_cpu(shadow_time, smp_processor_id());
 
        do {
-               shadow_time_version = s->time_version2;
+               dst->version = src->time_version2;
                rmb();
-               shadow_tv.tv_sec    = s->wc_sec;
-               shadow_tv.tv_usec   = s->wc_usec;
-               shadow_tsc_stamp    = (u32)s->tsc_timestamp;
-               shadow_system_time  = s->system_time;
+               dst->tsc_timestamp     = src->tsc_timestamp;
+               dst->system_timestamp  = src->system_time;
+               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+               dst->tsc_shift         = src->tsc_shift;
                rmb();
        }
-       while (shadow_time_version != s->time_version1);
-
-       cur_timer->mark_offset();
+       while (dst->version != src->time_version1);
+
+       dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
+
+       if ((shadow_tv.tv_sec != s->wc_sec) ||
+           (shadow_tv.tv_usec != s->wc_usec))
+               update_wallclock();
+}
+
+static inline int time_values_up_to_date(int cpu)
+{
+       struct vcpu_time_info   *src;
+       struct shadow_time_info *dst;
+
+       src = &HYPERVISOR_shared_info->vcpu_time[smp_processor_id()];
+       dst = &per_cpu(shadow_time, smp_processor_id());
+
+       return (dst->version == src->time_version2);
 }
 
 #define TIME_VALUES_UP_TO_DATE \
@@ -229,13 +342,18 @@
        unsigned long max_ntp_tick;
        unsigned long flags;
        s64 nsec;
+       unsigned int cpu;
+       struct shadow_time_info *shadow;
+
+       cpu = get_cpu();
+       shadow = &per_cpu(shadow_time, cpu);
 
        do {
                unsigned long lost;
 
                seq = read_seqbegin(&xtime_lock);
 
-               usec = cur_timer->get_offset();
+               usec = get_usec_offset(shadow);
                lost = jiffies - wall_jiffies;
 
                /*
@@ -256,11 +374,11 @@
                sec = xtime.tv_sec;
                usec += (xtime.tv_nsec / NSEC_PER_USEC);
 
-               nsec = shadow_system_time - processed_system_time;
+               nsec = shadow->system_timestamp - processed_system_time;
                __normalize_time(&sec, &nsec);
                usec += (long)nsec / NSEC_PER_USEC;
 
-               if (unlikely(!TIME_VALUES_UP_TO_DATE)) {
+               if (unlikely(!time_values_up_to_date(cpu))) {
                        /*
                         * We may have blocked for a long time,
                         * rendering our calculations invalid
@@ -275,19 +393,11 @@
                }
        } while (read_seqretry(&xtime_lock, seq));
 
+       put_cpu();
+
        while (usec >= USEC_PER_SEC) {
                usec -= USEC_PER_SEC;
                sec++;
-       }
-
-       /* Ensure that time-of-day is monotonically increasing. */
-       if ((sec < last_seen_tv.tv_sec) ||
-           ((sec == last_seen_tv.tv_sec) && (usec < last_seen_tv.tv_usec))) {
-               sec = last_seen_tv.tv_sec;
-               usec = last_seen_tv.tv_usec;
-       } else {
-               last_seen_tv.tv_sec = sec;
-               last_seen_tv.tv_usec = usec;
        }
 
        tv->tv_sec = sec;
@@ -302,12 +412,17 @@
        long wtm_nsec;
        s64 nsec;
        struct timespec xentime;
+       unsigned int cpu;
+       struct shadow_time_info *shadow;
 
        if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;
 
        if (!INDEPENDENT_WALLCLOCK())
                return 0; /* Silent failure? */
+
+       cpu = get_cpu();
+       shadow = &per_cpu(shadow_time, cpu);
 
        write_seqlock_irq(&xtime_lock);
 
@@ -317,9 +432,8 @@
         * be stale, so we can retry with fresh ones.
         */
  again:
-       nsec = (s64)tv->tv_nsec -
-           ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC);
-       if (unlikely(!TIME_VALUES_UP_TO_DATE)) {
+       nsec = (s64)tv->tv_nsec - (s64)get_nsec_offset(shadow);
+       if (unlikely(!time_values_up_to_date(cpu))) {
                __get_time_values_from_xen();
                goto again;
        }
@@ -335,7 +449,7 @@
         */
        nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
 
-       nsec -= (shadow_system_time - processed_system_time);
+       nsec -= (shadow->system_timestamp - processed_system_time);
 
        __normalize_time(&sec, &nsec);
        wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
@@ -349,23 +463,20 @@
        time_maxerror = NTP_PHASE_LIMIT;
        time_esterror = NTP_PHASE_LIMIT;
 
-       /* Reset all our running time counts. They make no sense now. */
-       last_seen_tv.tv_sec = 0;
-       last_update_from_xen = 0;
-
 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
        if (xen_start_info.flags & SIF_INITDOMAIN) {
                dom0_op_t op;
-               last_rtc_update = last_update_to_xen = 0;
                op.cmd = DOM0_SETTIME;
                op.u.settime.secs        = xentime.tv_sec;
                op.u.settime.usecs       = xentime.tv_nsec / NSEC_PER_USEC;
-               op.u.settime.system_time = shadow_system_time;
+               op.u.settime.system_time = shadow->system_timestamp;
                write_sequnlock_irq(&xtime_lock);
                HYPERVISOR_dom0_op(&op);
        } else
 #endif
                write_sequnlock_irq(&xtime_lock);
+
+       put_cpu();
 
        clock_was_set();
        return 0;
@@ -403,9 +514,30 @@
  */
 unsigned long long monotonic_clock(void)
 {
-       return cur_timer->monotonic_clock();
+       int cpu = get_cpu();
+       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+       s64 off;
+       unsigned long flags;
+       
+       for ( ; ; ) {
+               off = get_nsec_offset(shadow);
+               if (time_values_up_to_date(cpu))
+                       break;
+               write_seqlock_irqsave(&xtime_lock, flags);
+               __get_time_values_from_xen();
+               write_sequnlock_irqrestore(&xtime_lock, flags);
+       }
+
+       put_cpu();
+
+       return shadow->system_timestamp + off;
 }
 EXPORT_SYMBOL(monotonic_clock);
+
+unsigned long long sched_clock(void)
+{
+       return monotonic_clock();
+}
 
 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
 unsigned long profile_pc(struct pt_regs *regs)
@@ -427,27 +559,26 @@
 static inline void do_timer_interrupt(int irq, void *dev_id,
                                        struct pt_regs *regs)
 {
-       time_t wtm_sec, sec;
-       s64 delta, delta_cpu, nsec;
-       long sec_diff, wtm_nsec;
+       s64 delta, delta_cpu;
        int cpu = smp_processor_id();
+       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
 
        do {
                __get_time_values_from_xen();
 
-               delta = delta_cpu = (s64)shadow_system_time +
-                       ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC);
+               delta = delta_cpu = 
+                       shadow->system_timestamp + get_nsec_offset(shadow);
                delta     -= processed_system_time;
                delta_cpu -= per_cpu(processed_system_time, cpu);
        }
-       while (!TIME_VALUES_UP_TO_DATE);
+       while (!time_values_up_to_date(cpu));
 
        if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) {
                printk("Timer ISR/%d: Time went backwards: "
                       "delta=%lld cpu_delta=%lld shadow=%lld "
                       "off=%lld processed=%lld cpu_processed=%lld\n",
-                      cpu, delta, delta_cpu, shadow_system_time,
-                      ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC), 
+                      cpu, delta, delta_cpu, shadow->system_timestamp,
+                      (s64)get_nsec_offset(shadow),
                       processed_system_time,
                       per_cpu(processed_system_time, cpu));
                for (cpu = 0; cpu < num_online_cpus(); cpu++)
@@ -470,76 +601,6 @@
                update_process_times(user_mode(regs));
                profile_tick(CPU_PROFILING, regs);
        }
-
-       if (cpu != 0)
-               return;
-
-       /*
-        * Take synchronised time from Xen once a minute if we're not
-        * synchronised ourselves, and we haven't chosen to keep an independent
-        * time base.
-        */
-       if (!INDEPENDENT_WALLCLOCK() &&
-           ((time_status & STA_UNSYNC) != 0) &&
-           (xtime.tv_sec > (last_update_from_xen + 60))) {
-               /* Adjust shadow for jiffies that haven't updated xtime yet. */
-               shadow_tv.tv_usec -= 
-                       (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ);
-               HANDLE_USEC_UNDERFLOW(shadow_tv);
-
-               /*
-                * Reset our running time counts if they are invalidated by
-                * a warp backwards of more than 500ms.
-                */
-               sec_diff = xtime.tv_sec - shadow_tv.tv_sec;
-               if (unlikely(abs(sec_diff) > 1) ||
-                   unlikely(((sec_diff * USEC_PER_SEC) +
-                             (xtime.tv_nsec / NSEC_PER_USEC) -
-                             shadow_tv.tv_usec) > 500000)) {
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
-                       last_rtc_update = last_update_to_xen = 0;
-#endif
-                       last_seen_tv.tv_sec = 0;
-               }
-
-               /* Update our unsynchronised xtime appropriately. */
-               sec = shadow_tv.tv_sec;
-               nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
-
-               __normalize_time(&sec, &nsec);
-               wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
-               wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
-               set_normalized_timespec(&xtime, sec, nsec);
-               set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
-               last_update_from_xen = sec;
-       }
-
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
-       if (!(xen_start_info.flags & SIF_INITDOMAIN))
-               return;
-
-       /* Send synchronised time to Xen approximately every minute. */
-       if (((time_status & STA_UNSYNC) == 0) &&
-           (xtime.tv_sec > (last_update_to_xen + 60))) {
-               dom0_op_t op;
-               struct timeval tv;
-
-               tv.tv_sec   = xtime.tv_sec;
-               tv.tv_usec  = xtime.tv_nsec / NSEC_PER_USEC;
-               tv.tv_usec += (jiffies - wall_jiffies) * (USEC_PER_SEC/HZ);
-               HANDLE_USEC_OVERFLOW(tv);
-
-               op.cmd = DOM0_SETTIME;
-               op.u.settime.secs        = tv.tv_sec;
-               op.u.settime.usecs       = tv.tv_usec;
-               op.u.settime.system_time = shadow_system_time;
-               HYPERVISOR_dom0_op(&op);
-
-               last_update_to_xen = xtime.tv_sec;
-       }
-#endif
 }
 
 /*
@@ -731,12 +792,10 @@
        xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
        set_normalized_timespec(&wall_to_monotonic,
                -xtime.tv_sec, -xtime.tv_nsec);
-       processed_system_time = shadow_system_time;
+       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
 
-       if (timer_tsc_init.init(NULL) != 0)
-               BUG();
-       printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+       init_cpu_khz();
 
 #if defined(__x86_64__)
        vxtime.mode = VXTIME_TSC;
@@ -807,21 +866,15 @@
 /* No locking required. We are only CPU running, and interrupts are off. */
 void time_resume(void)
 {
-       if (timer_tsc_init.init(NULL) != 0)
-               BUG();
+       init_cpu_khz();
 
        /* Get timebases for new environment. */ 
        __get_time_values_from_xen();
 
        /* Reset our own concept of passage of system time. */
-       processed_system_time = shadow_system_time;
+       processed_system_time =
+               per_cpu(shadow_time, smp_processor_id()).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
-
-       /* Accept a warp in UTC (wall-clock) time. */
-       last_seen_tv.tv_sec = 0;
-
-       /* Make sure we resync UTC time with Xen on next timer interrupt. */
-       last_update_from_xen = 0;
 }
 
 #ifdef CONFIG_SMP
@@ -832,7 +885,8 @@
 
        do {
                seq = read_seqbegin(&xtime_lock);
-               per_cpu(processed_system_time, cpu) = shadow_system_time;
+               per_cpu(processed_system_time, cpu) = 
+                       per_cpu(shadow_time, cpu).system_timestamp;
        } while (read_seqretry(&xtime_lock, seq));
 
        per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER);
@@ -861,3 +915,13 @@
        return 0;
 }
 __initcall(xen_sysctl_init);
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
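
Closing the loop on the earlier 2 GHz illustration (again, not part of the
patch): init_cpu_khz() above inverts the fixed-point multiplier to recover
the frequency that Xen measured at boot:

    /* Hypothetical values from the 2 GHz example: mul = 0x80000000,
     * tsc_shift = 0.                                                */
    u64 khz = 1000000ULL << 32;   /* 10^6 << 32                      */
    khz /= 0x80000000UL;          /* / tsc_to_system_mul -> 2000000  */
    /* down_shift(khz, -tsc_shift) with tsc_shift == 0 leaves
     * cpu_khz == 2000000, i.e. 2 GHz, matching the assumed TSC.     */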

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
