Re: [Xen-devel] [PATCH 7/10] linux 2.6.18: time handling

To: Keir Fraser <keir@xxxxxxxxxxxxx>
Subject: Re: [Xen-devel] [PATCH 7/10] linux 2.6.18: time handling
From: Jeremy Fitzhardinge <jeremy@xxxxxxxx>
Date: Tue, 06 Mar 2007 07:50:39 -0800
Cc: xen-devel@xxxxxxxxxxxxxxxxxxx, Jan Beulich <jbeulich@xxxxxxxxxx>
Delivery-date: Tue, 06 Mar 2007 07:49:53 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
In-reply-to: <C213031C.AAEE%keir@xxxxxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <C213031C.AAEE%keir@xxxxxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Thunderbird 1.5.0.9 (X11/20070212)
Keir Fraser wrote:
> I think Jeremy Fitzhardinge has an alternative clocksource patch which iirc
> is more in line with how Xen time works (should advertise a GHz frequency
> clocksource, and do scaling of the TSC value according to time-record values
> read from shared_info). Having thought about this some more I think
> clocksource support is worth getting into our tree, but let's look at both
> available patches and decide which is the better basis for further work.
>
> Jeremy: If I'm not mistaken and you do have a patch floating around, could
> you post it?
>   

Yes, there's a Xen clocksource in the pv_ops tree.  There's no nicely
separable patch, but the mechanism is pretty simple.  I've attached
arch/i386/xen/time.c.
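
For context: the generic clocksource layer converts cycles to
nanoseconds as ns = (cycles * mult) >> shift. Xen system time is
already in nanoseconds, so the file picks mult = 1 << XEN_SHIFT and
shift = XEN_SHIFT, making that conversion an identity. A minimal
sketch of the arithmetic (helper name hypothetical, not in the file):

/* With mult == 1 << shift this returns 'cycles' unchanged, so the
 * clocksource can report Xen system time (ns since VM start)
 * directly. Meant for deltas; the multiply can overflow for very
 * large cycle counts. */
static inline u64 xen_cyc2ns_sketch(u64 cycles, u32 mult, u32 shift)
{
        return (cycles * mult) >> shift;
}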

    J

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/clocksource.h>

#include <asm/xen/hypercall.h>
#include <asm/arch_hooks.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
static int __init __permitted_clock_jitter(char *str)
{
        permitted_clock_jitter = simple_strtoul(str, NULL, 0);
        return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);


/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
        u64 tsc_timestamp;     /* TSC at last update of time vals.  */
        u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
        u32 tsc_to_nsec_mul;
        int tsc_shift;
        u32 version;
};

static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time;   /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);

/* How much CPU time was spent blocked and how much was 'stolen'? */
static DEFINE_PER_CPU(u64, processed_stolen_time);
static DEFINE_PER_CPU(u64, processed_blocked_time);

/* Current runstate of each CPU (updated automatically by the hypervisor). */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)

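/*
 * Derive the CPU frequency in kHz by inverting Xen's scaling
 * parameters: a shift-adjusted TSC delta times tsc_to_system_mul,
 * taken as a fraction of 2^32, gives nanoseconds.
 */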
unsigned long xen_cpu_khz(void)
{
        u64 cpu_khz = 1000000ULL << 32;
        const struct vcpu_time_info *info =
                &HYPERVISOR_shared_info->vcpu_info[0].time;

        do_div(cpu_khz, info->tsc_to_system_mul);
        if (info->tsc_shift < 0)
                cpu_khz <<= -info->tsc_shift;
        else
                cpu_khz >>= info->tsc_shift;

        return cpu_khz;
}

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static void get_time_values_from_xen(void)
{
        struct vcpu_time_info   *src;
        struct shadow_time_info *dst;

        src = &read_pda(xen.vcpu)->time;
        dst = &get_cpu_var(shadow_time);

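        /*
         * Xen's update protocol works like a seqlock: src->version is
         * odd while the hypervisor is mid-update, so retry until we
         * copy a consistent snapshot with matching, even versions.
         */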
        do {
                dst->version = src->version;
                rmb();
                dst->tsc_timestamp     = src->tsc_timestamp;
                dst->system_timestamp  = src->system_time;
                dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
                dst->tsc_shift         = src->tsc_shift;
                rmb();
        } while ((src->version & 1) | (dst->version ^ src->version));

        put_cpu_var(shadow_time);
}

static inline int time_values_up_to_date(void)
{
        struct vcpu_time_info   *src;
        unsigned dstversion;

        src = &read_pda(xen.vcpu)->time;
        dstversion = get_cpu_var(shadow_time).version;
        put_cpu_var(shadow_time);

        rmb();
        return (dstversion == src->version);
}

/*
 * Scale a 64-bit delta: shift it by 'shift', then multiply by the
 * 32-bit fraction 'mul_frac' (interpreted as mul_frac / 2^32),
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
        u64 product;
#ifdef __i386__
        u32 tmp1, tmp2;
#endif

        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

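        /*
         * Compute the 96-bit product delta * mul_frac and keep the
         * middle 64 bits, i.e. (delta * mul_frac) >> 32: two 32-bit
         * muls on i386, one 64-bit mul plus shrd on x86-64.
         */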
#ifdef __i386__
        __asm__ (
                "mul  %5       ; "
                "mov  %4,%%eax ; "
                "mov  %%edx,%4 ; "
                "mul  %5       ; "
                "xor  %5,%5    ; "
                "add  %4,%%eax ; "
                "adc  %5,%%edx ; "
                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
                : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
        __asm__ (
                "mul %%rdx ; shrd $32,%%rdx,%%rax"
                : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

        return product;
}

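/*
 * Nanoseconds elapsed since the last shadow snapshot, computed from
 * the raw TSC delta via the snapshot's scaling parameters.
 */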
static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
        u64 now, delta;
        rdtscll(now);
        delta = now - shadow->tsc_timestamp;
        return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}


static void xen_timer_interrupt_hook(void)
{
        s64 delta, delta_cpu, stolen, blocked;
        u64 sched_time;
        int i, cpu = smp_processor_id();
        unsigned long ticks;
        struct shadow_time_info *shadow = &__get_cpu_var(shadow_time);
        struct vcpu_runstate_info *runstate = &__get_cpu_var(runstate);

        do {
                get_time_values_from_xen();

                /* Obtain a consistent snapshot of elapsed wallclock cycles. */
                delta = delta_cpu =
                        shadow->system_timestamp + get_nsec_offset(shadow);
                if (0)
                        printk("tsc_timestamp=%llu system_timestamp=%llu "
                               "tsc_to_nsec=%u tsc_shift=%d, version=%u, "
                               "delta=%lld processed_system_time=%lld\n",
                               shadow->tsc_timestamp, shadow->system_timestamp,
                               shadow->tsc_to_nsec_mul, shadow->tsc_shift,
                               shadow->version, delta, processed_system_time);

                delta     -= processed_system_time;
                delta_cpu -= __get_cpu_var(processed_system_time);

                /*
                 * Obtain a consistent snapshot of stolen/blocked cycles. We
                 * can use state_entry_time to detect if we get preempted here.
                 */
                do {
                        sched_time = runstate->state_entry_time;
                        barrier();
                        stolen = runstate->time[RUNSTATE_runnable] +
                                runstate->time[RUNSTATE_offline] -
                                __get_cpu_var(processed_stolen_time);
                        blocked = runstate->time[RUNSTATE_blocked] -
                                __get_cpu_var(processed_blocked_time);
                        barrier();
                } while (sched_time != runstate->state_entry_time);
        } while (!time_values_up_to_date());

        if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
             unlikely(delta_cpu < -(s64)permitted_clock_jitter))
            && printk_ratelimit()) {
                printk("Timer ISR/%d: Time went backwards: "
                       "delta=%lld delta_cpu=%lld shadow=%lld "
                       "off=%lld processed=%lld cpu_processed=%lld\n",
                       cpu, delta, delta_cpu, shadow->system_timestamp,
                       (s64)get_nsec_offset(shadow),
                       processed_system_time,
                       __get_cpu_var(processed_system_time));
                for (i = 0; i < num_online_cpus(); i++)
                        printk(" %d: %lld\n", i,
                               per_cpu(processed_system_time, i));
        }

        /* System-wide jiffy work. */
        ticks = 0;
        while (delta > NS_PER_TICK) {
                delta -= NS_PER_TICK;
                processed_system_time += NS_PER_TICK;
                ticks++;
        }
        do_timer(ticks);

        /*
         * Account stolen ticks.
         * HACK: Passing NULL to account_steal_time()
         * ensures that the ticks are accounted as stolen.
         */
        if ((stolen > 0) && (delta_cpu > 0)) {
                delta_cpu -= stolen;
                if (unlikely(delta_cpu < 0))
                        stolen += delta_cpu; /* clamp local-time progress */
                do_div(stolen, NS_PER_TICK);
                __get_cpu_var(processed_stolen_time) += stolen * NS_PER_TICK;
                __get_cpu_var(processed_system_time) += stolen * NS_PER_TICK;
                account_steal_time(NULL, (cputime_t)stolen);
        }

        /*
         * Account blocked ticks.
         * HACK: Passing idle_task to account_steal_time()
         * ensures that the ticks are accounted as idle/wait.
         */
        if ((blocked > 0) && (delta_cpu > 0)) {
                delta_cpu -= blocked;
                if (unlikely(delta_cpu < 0))
                        blocked += delta_cpu; /* clamp local-time progress */
                do_div(blocked, NS_PER_TICK);
                __get_cpu_var(processed_blocked_time) += blocked * NS_PER_TICK;
                __get_cpu_var(processed_system_time)  += blocked * NS_PER_TICK;
                account_steal_time(idle_task(cpu), (cputime_t)blocked);
        }

        update_process_times(user_mode_vm(get_irq_regs()));
}

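/*
 * Clocksource read hook: returns Xen system time (ns since VM start),
 * i.e. the last snapshot's timestamp plus the scaled TSC delta.
 */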
static cycle_t xen_clocksource_read(void)
{
        struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
        cycle_t ret;

        get_time_values_from_xen();

        ret = shadow->system_timestamp + get_nsec_offset(shadow);

        put_cpu_var(shadow_time);

        return ret;
}

static void xen_read_wallclock(struct timespec *ts)
{
        const struct shared_info *s = HYPERVISOR_shared_info;
        u32 version;
        u64 delta;
        struct timespec now;

        /* get wallclock at system boot */
        do {
                version = s->wc_version;
                rmb();
                now.tv_sec  = s->wc_sec;
                now.tv_nsec = s->wc_nsec;
                rmb();
        } while ((s->wc_version & 1) | (version ^ s->wc_version));

        delta = xen_clocksource_read(); /* time since system boot */
        delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

        now.tv_nsec = do_div(delta, NSEC_PER_SEC);
        now.tv_sec = delta;

        set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

unsigned long xen_get_wallclock(void)
{
        struct timespec ts;

        xen_read_wallclock(&ts);

        return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
        /* do nothing for domU */
        return -1;
}

static struct clocksource xen_clocksource = {
        .name = "xen",
        .rating = 400,
        .read = xen_clocksource_read,
        .mask = ~0,
        .mult = 1<<XEN_SHIFT,           /* time directly in nanoseconds */
        .shift = XEN_SHIFT,
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,
};

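/*
 * Register a per-cpu runstate area with the hypervisor and record the
 * baseline blocked/stolen times, so later deltas start from zero.
 */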
static void init_missing_ticks_accounting(int cpu)
{
        struct vcpu_register_runstate_memory_area area;
        struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

        memset(runstate, 0, sizeof(*runstate));

        area.addr.v = runstate;
        HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);

        per_cpu(processed_blocked_time, cpu) =
                runstate->time[RUNSTATE_blocked];
        per_cpu(processed_stolen_time, cpu) =
                runstate->time[RUNSTATE_runnable] +
                runstate->time[RUNSTATE_offline];
}

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
        /*
         * Here we are in the timer irq handler. We have irqs locally
         * disabled, but we don't know if the timer_bh is running on
         * another CPU, so we need to avoid an SMP race with it. NOTE:
         * we don't need the irq version of write_lock because, as just
         * said, we have irqs locally disabled. -arca
         */
        write_seqlock(&xtime_lock);

        xen_timer_interrupt_hook();

        write_sequnlock(&xtime_lock);

        return IRQ_HANDLED;
}

static void setup_cpu0_timer_irq(void)
{
        printk(KERN_DEBUG "installing Xen timer for CPU 0\n");

        bind_virq_to_irqhandler(
                VIRQ_TIMER,
                0,
                xen_timer_interrupt,
                SA_INTERRUPT,
                "timer0",
                NULL);
}

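/*
 * Boot-time setup: snapshot Xen's time records, register the
 * clocksource, initialize xtime from the Xen wallclock, and bind
 * VIRQ_TIMER for CPU 0.
 */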
__init void xen_time_init(void)
{
        get_time_values_from_xen();

        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;

        init_missing_ticks_accounting(0);

        clocksource_register(&xen_clocksource);

        /* Set initial system time with full resolution */
        xen_read_wallclock(&xtime);
        set_normalized_timespec(&wall_to_monotonic,
                                -xtime.tv_sec, -xtime.tv_nsec);

        tsc_disable = 0;

        setup_cpu0_timer_irq();
}

/* Convert jiffies to system time. */
static u64 jiffies_to_st(unsigned long j)
{
        unsigned long seq;
        long delta;
        u64 st;

        do {
                seq = read_seqbegin(&xtime_lock);
                delta = j - jiffies;
                if (delta < 1) {
                        /* Triggers in some wrap-around cases, but that's okay:
                         * we just end up with a shorter timeout. */
                        st = processed_system_time + NS_PER_TICK;
                } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
                        /* Very long timeout means there is no pending timer.
                         * We indicate this to Xen by passing zero timeout. */
                        st = 0;
                } else {
                        st = processed_system_time + delta * (u64)NS_PER_TICK;
                }
        } while (read_seqretry(&xtime_lock, seq));

        return st;
}

/*
 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
 * These functions are based on implementations from arch/s390/kernel/time.c
 */
void stop_hz_timer(void)
{
        unsigned int cpu = smp_processor_id();
        unsigned long j;

        cpu_set(cpu, nohz_cpu_mask);

        /*
         * See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs
         * ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a
         * value of rcp->cur that matches rdp->quiescbatch and allows us to
         * stop the hz timer then the cpumasks created for subsequent values
         * of cur in rcu_start_batch are guaranteed to pick up the updated
         * nohz_cpu_mask and so will not depend on this cpu.
         */

        smp_mb();

        /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
        if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
            (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
                cpu_clear(cpu, nohz_cpu_mask);
                j = jiffies + 1;
        }

        if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
                BUG();
}

void start_hz_timer(void)
{
        cpu_clear(smp_processor_id(), nohz_cpu_mask);
}

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel