Re: [Xen-devel] [PATCH 7/10] linux 2.6.18: time handling

To: Keir Fraser <keir@xxxxxxxxxxxxx>
Subject: Re: [Xen-devel] [PATCH 7/10] linux 2.6.18: time handling
From: Jeremy Fitzhardinge <jeremy@xxxxxxxx>
Date: Tue, 06 Mar 2007 07:50:39 -0800
Cc: xen-devel@xxxxxxxxxxxxxxxxxxx, Jan Beulich <jbeulich@xxxxxxxxxx>
Delivery-date: Tue, 06 Mar 2007 07:49:53 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
In-reply-to: <C213031C.AAEE%keir@xxxxxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <C213031C.AAEE%keir@xxxxxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Thunderbird 1.5.0.9 (X11/20070212)
Keir Fraser wrote:
> I think Jeremy Fitzhardinge has an alternative clocksource patch which iirc
> is more in line with how Xen time works (should advertise a GHz frequency
> clocksource, and do scaling of the TSC value according to time-record values
> read from shared_info). Having thought about this some more I think
> clocksource support is worth getting into our tree, but let's look at both
> available patches and decide which is the better basis for further work.
>
> Jeremy: If I'm not mistaken and you do have a patch floating around, could
> you post it?
>   

Yes, there's a Xen clocksource in the pv_ops tree.  There's no nicely
separable patch, but the mechanism is pretty simple.  I've attached
arch/i386/xen/time.c.
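
For context: the generic clocksource layer converts cycles to
nanoseconds as ns = (cycles * mult) >> shift. Xen system time is
already in nanoseconds, so the file picks mult = 1 << XEN_SHIFT and
shift = XEN_SHIFT, making that conversion an identity. A minimal
sketch of the arithmetic (helper name hypothetical, not in the file):

/* With mult == 1 << shift this returns 'cycles' unchanged, so the
 * clocksource can report Xen system time (ns since VM start)
 * directly. Meant for deltas; the multiply can overflow for very
 * large cycle counts. */
static inline u64 xen_cyc2ns_sketch(u64 cycles, u32 mult, u32 shift)
{
        return (cycles * mult) >> shift;
}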

    J

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/clocksource.h>

#include <asm/xen/hypercall.h>
#include <asm/arch_hooks.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
static int __init __permitted_clock_jitter(char *str)
{
        permitted_clock_jitter = simple_strtoul(str, NULL, 0);
        return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);


/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
        u64 tsc_timestamp;     /* TSC at last update of time vals.  */
        u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
        u32 tsc_to_nsec_mul;
        int tsc_shift;
        u32 version;
};

static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time;   /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);

/* How much CPU time was spent blocked and how much was 'stolen'? */
static DEFINE_PER_CPU(u64, processed_stolen_time);
static DEFINE_PER_CPU(u64, processed_blocked_time);

/* Current runstate of each CPU (updated automatically by the hypervisor). */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)

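/*
 * Derive the CPU frequency in kHz by inverting Xen's scaling
 * parameters: a shift-adjusted TSC delta times tsc_to_system_mul,
 * taken as a fraction of 2^32, gives nanoseconds.
 */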
unsigned long xen_cpu_khz(void)
{
        u64 cpu_khz = 1000000ULL << 32;
        const struct vcpu_time_info *info =
                &HYPERVISOR_shared_info->vcpu_info[0].time;

        do_div(cpu_khz, info->tsc_to_system_mul);
        if (info->tsc_shift < 0)
                cpu_khz <<= -info->tsc_shift;
        else
                cpu_khz >>= info->tsc_shift;

        return cpu_khz;
}

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static void get_time_values_from_xen(void)
{
        struct vcpu_time_info   *src;
        struct shadow_time_info *dst;

        src = &read_pda(xen.vcpu)->time;
        dst = &get_cpu_var(shadow_time);

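        /*
         * Xen's update protocol works like a seqlock: src->version is
         * odd while the hypervisor is mid-update, so retry until we
         * copy a consistent snapshot with matching, even versions.
         */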
        do {
                dst->version = src->version;
                rmb();
                dst->tsc_timestamp     = src->tsc_timestamp;
                dst->system_timestamp  = src->system_time;
                dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
                dst->tsc_shift         = src->tsc_shift;
                rmb();
        } while ((src->version & 1) | (dst->version ^ src->version));

        put_cpu_var(shadow_time);
}

static inline int time_values_up_to_date(void)
{
        struct vcpu_time_info   *src;
        unsigned dstversion;

        src = &read_pda(xen.vcpu)->time;
        dstversion = get_cpu_var(shadow_time).version;
        put_cpu_var(shadow_time);

        rmb();
        return (dstversion == src->version);
}

/*
 * Scale a 64-bit delta: shift it by 'shift', then multiply by the
 * 32-bit fraction 'mul_frac' (interpreted as mul_frac / 2^32),
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
        u64 product;
#ifdef __i386__
        u32 tmp1, tmp2;
#endif

        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

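        /*
         * Compute the 96-bit product delta * mul_frac and keep the
         * middle 64 bits, i.e. (delta * mul_frac) >> 32: two 32-bit
         * muls on i386, one 64-bit mul plus shrd on x86-64.
         */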
#ifdef __i386__
        __asm__ (
                "mul  %5       ; "
                "mov  %4,%%eax ; "
                "mov  %%edx,%4 ; "
                "mul  %5       ; "
                "xor  %5,%5    ; "
                "add  %4,%%eax ; "
                "adc  %5,%%edx ; "
                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
                : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
        __asm__ (
                "mul %%rdx ; shrd $32,%%rdx,%%rax"
                : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

        return product;
}

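/*
 * Nanoseconds elapsed since the last shadow snapshot, computed from
 * the raw TSC delta via the snapshot's scaling parameters.
 */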
static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
        u64 now, delta;
        rdtscll(now);
        delta = now - shadow->tsc_timestamp;
        return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}


static void xen_timer_interrupt_hook(void)
{
        s64 delta, delta_cpu, stolen, blocked;
        u64 sched_time;
        int i, cpu = smp_processor_id();
        unsigned long ticks;
        struct shadow_time_info *shadow = &__get_cpu_var(shadow_time);
        struct vcpu_runstate_info *runstate = &__get_cpu_var(runstate);

        do {
                get_time_values_from_xen();

                /* Obtain a consistent snapshot of elapsed wallclock cycles. */
                delta = delta_cpu =
                        shadow->system_timestamp + get_nsec_offset(shadow);
                if (0)
                        printk("tsc_timestamp=%llu system_timestamp=%llu "
                               "tsc_to_nsec=%u tsc_shift=%d, version=%u, "
                               "delta=%lld processed_system_time=%lld\n",
                               shadow->tsc_timestamp, shadow->system_timestamp,
                               shadow->tsc_to_nsec_mul, shadow->tsc_shift,
                               shadow->version, delta, processed_system_time);

                delta     -= processed_system_time;
                delta_cpu -= __get_cpu_var(processed_system_time);

                /*
                 * Obtain a consistent snapshot of stolen/blocked cycles. We
                 * can use state_entry_time to detect if we get preempted here.
                 */
                do {
                        sched_time = runstate->state_entry_time;
                        barrier();
                        stolen = runstate->time[RUNSTATE_runnable] +
                                runstate->time[RUNSTATE_offline] -
                                __get_cpu_var(processed_stolen_time);
                        blocked = runstate->time[RUNSTATE_blocked] -
                                __get_cpu_var(processed_blocked_time);
                        barrier();
                } while (sched_time != runstate->state_entry_time);
        } while (!time_values_up_to_date());

        if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
             unlikely(delta_cpu < -(s64)permitted_clock_jitter))
            && printk_ratelimit()) {
                printk("Timer ISR/%d: Time went backwards: "
                       "delta=%lld delta_cpu=%lld shadow=%lld "
                       "off=%lld processed=%lld cpu_processed=%lld\n",
                       cpu, delta, delta_cpu, shadow->system_timestamp,
                       (s64)get_nsec_offset(shadow),
                       processed_system_time,
                       __get_cpu_var(processed_system_time));
                for (i = 0; i < num_online_cpus(); i++)
                        printk(" %d: %lld\n", i,
                               per_cpu(processed_system_time, i));
        }

        /* System-wide jiffy work. */
        ticks = 0;
        while (delta > NS_PER_TICK) {
                delta -= NS_PER_TICK;
                processed_system_time += NS_PER_TICK;
                ticks++;
        }
        do_timer(ticks);

        /*
         * Account stolen ticks.
         * HACK: Passing NULL to account_steal_time()
         * ensures that the ticks are accounted as stolen.
         */
        if ((stolen > 0) && (delta_cpu > 0)) {
                delta_cpu -= stolen;
                if (unlikely(delta_cpu < 0))
                        stolen += delta_cpu; /* clamp local-time progress */
                do_div(stolen, NS_PER_TICK);
                __get_cpu_var(processed_stolen_time) += stolen * NS_PER_TICK;
                __get_cpu_var(processed_system_time) += stolen * NS_PER_TICK;
                account_steal_time(NULL, (cputime_t)stolen);
        }

        /*
         * Account blocked ticks.
         * HACK: Passing idle_task to account_steal_time()
         * ensures that the ticks are accounted as idle/wait.
         */
        if ((blocked > 0) && (delta_cpu > 0)) {
                delta_cpu -= blocked;
                if (unlikely(delta_cpu < 0))
                        blocked += delta_cpu; /* clamp local-time progress */
                do_div(blocked, NS_PER_TICK);
                __get_cpu_var(processed_blocked_time) += blocked * NS_PER_TICK;
                __get_cpu_var(processed_system_time)  += blocked * NS_PER_TICK;
                account_steal_time(idle_task(cpu), (cputime_t)blocked);
        }

        update_process_times(user_mode_vm(get_irq_regs()));
}

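/*
 * Clocksource read hook: returns Xen system time (ns since VM start),
 * i.e. the last snapshot's timestamp plus the scaled TSC delta.
 */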
static cycle_t xen_clocksource_read(void)
{
        struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
        cycle_t ret;

        get_time_values_from_xen();

        ret = shadow->system_timestamp + get_nsec_offset(shadow);

        put_cpu_var(shadow_time);

        return ret;
}

static void xen_read_wallclock(struct timespec *ts)
{
        const struct shared_info *s = HYPERVISOR_shared_info;
        u32 version;
        u64 delta;
        struct timespec now;

        /* get wallclock at system boot */
        do {
                version = s->wc_version;
                rmb();
                now.tv_sec  = s->wc_sec;
                now.tv_nsec = s->wc_nsec;
                rmb();
        } while ((s->wc_version & 1) | (version ^ s->wc_version));

        delta = xen_clocksource_read(); /* time since system boot */
        delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

        now.tv_nsec = do_div(delta, NSEC_PER_SEC);
        now.tv_sec = delta;

        set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

unsigned long xen_get_wallclock(void)
{
        struct timespec ts;

        xen_read_wallclock(&ts);

        return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
        /* do nothing for domU */
        return -1;
}

static struct clocksource xen_clocksource = {
        .name = "xen",
        .rating = 400,
        .read = xen_clocksource_read,
        .mask = ~0,
        .mult = 1<<XEN_SHIFT,           /* time directly in nanoseconds */
        .shift = XEN_SHIFT,
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

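/*
 * Register a per-cpu runstate area with the hypervisor and record the
 * baseline blocked/stolen times, so later deltas start from zero.
 */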
static void init_missing_ticks_accounting(int cpu)
{
        struct vcpu_register_runstate_memory_area area;
        struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

        memset(runstate, 0, sizeof(*runstate));

        area.addr.v = runstate;
        HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);

        per_cpu(processed_blocked_time, cpu) =
                runstate->time[RUNSTATE_blocked];
        per_cpu(processed_stolen_time, cpu) =
                runstate->time[RUNSTATE_runnable] +
                runstate->time[RUNSTATE_offline];
}

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
        /*
         * Here we are in the timer irq handler. We have irqs locally
         * disabled, but we don't know if the timer_bh is running on
         * another CPU, so we need to avoid an SMP race with it. NOTE:
         * we don't need the irq version of write_lock because, as just
         * said, we have irqs locally disabled. -arca
         */
        write_seqlock(&xtime_lock);

        xen_timer_interrupt_hook();

        write_sequnlock(&xtime_lock);

        return IRQ_HANDLED;
}

static void setup_cpu0_timer_irq(void)
{
        printk(KERN_DEBUG "installing Xen timer for CPU 0\n");

        bind_virq_to_irqhandler(
                VIRQ_TIMER,
                0,
                xen_timer_interrupt,
                SA_INTERRUPT,
                "timer0",
                NULL);
}

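/*
 * Boot-time setup: snapshot Xen's time records, register the
 * clocksource, initialize xtime from the Xen wallclock, and bind
 * VIRQ_TIMER for CPU 0.
 */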
__init void xen_time_init(void)
{
        get_time_values_from_xen();

        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;

        init_missing_ticks_accounting(0);

        clocksource_register(&xen_clocksource);

        /* Set initial system time with full resolution */
        xen_read_wallclock(&xtime);
        set_normalized_timespec(&wall_to_monotonic,
                                -xtime.tv_sec, -xtime.tv_nsec);

        tsc_disable = 0;

        setup_cpu0_timer_irq();
}

/* Convert jiffies to system time. */
static u64 jiffies_to_st(unsigned long j)
{
        unsigned long seq;
        long delta;
        u64 st;

        do {
                seq = read_seqbegin(&xtime_lock);
                delta = j - jiffies;
                if (delta < 1) {
                        /* Triggers in some wrap-around cases, but that's okay:
                         * we just end up with a shorter timeout. */
                        st = processed_system_time + NS_PER_TICK;
                } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
                        /* Very long timeout means there is no pending timer.
                         * We indicate this to Xen by passing zero timeout. */
                        st = 0;
                } else {
                        st = processed_system_time + delta * (u64)NS_PER_TICK;
                }
        } while (read_seqretry(&xtime_lock, seq));

        return st;
}

/*
 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
 * These functions are based on implementations from arch/s390/kernel/time.c
 */
void stop_hz_timer(void)
{
        unsigned int cpu = smp_processor_id();
        unsigned long j;

        cpu_set(cpu, nohz_cpu_mask);

        /*
         * See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs
         * ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a
         * value of rcp->cur that matches rdp->quiescbatch and allows us to
         * stop the hz timer then the cpumasks created for subsequent values
         * of cur in rcu_start_batch are guaranteed to pick up the updated
         * nohz_cpu_mask and so will not depend on this cpu.
         */

        smp_mb();

        /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
        if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
            (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
                cpu_clear(cpu, nohz_cpu_mask);
                j = jiffies + 1;
        }

        if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
                BUG();
}

void start_hz_timer(void)
{
        cpu_clear(smp_processor_id(), nohz_cpu_mask);
}

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel