Keir Fraser wrote:
> I think Jeremy Fitzhardinge has an alternative clocksource patch which iirc
> is more in line with how Xen time works (should advertise a GHz frequency
> clocksource, and do scaling of the TSC value according to time-record values
> read from shared_info). Having thought about this some more I think
> clocksource support is worth getting into our tree, but let's look at both
> available patches and decide which is the better basis for further work.
>
> Jeremy: If I'm not mistaken and you do have a patch floating around, could
> you post it?
>
Yes, there's a Xen clocksource in the pv_ops tree. There's no nicely
separable patch, but the mechanism is pretty simple. I've attached
arch/i386/xen/time.c
J
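/*
 * In brief: for each vcpu, Xen publishes a time record in shared_info
 * (tsc_timestamp, system_time, tsc_to_system_mul, tsc_shift), and the
 * guest extrapolates the current system time as
 *
 *   now_ns = system_time
 *            + (((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32
 *
 * (the shift goes the other way when tsc_shift is negative). That value,
 * already in nanoseconds, is what the clocksource below reports.
 */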
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/clocksource.h>
#include <asm/xen/hypercall.h>
#include <asm/arch_hooks.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include "xen-ops.h"
#define XEN_SHIFT 22
/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
static int __init __permitted_clock_jitter(char *str)
{
permitted_clock_jitter = simple_strtoul(str, NULL, 0);
return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);
/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
u64 tsc_timestamp; /* TSC at last update of time vals. */
u64 system_timestamp; /* Time, in nanosecs, since boot. */
u32 tsc_to_nsec_mul;
int tsc_shift;
u32 version;
};
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time; /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);
/* How much CPU time was spent blocked and how much was 'stolen'? */
static DEFINE_PER_CPU(u64, processed_stolen_time);
static DEFINE_PER_CPU(u64, processed_blocked_time);
/* Current runstate of each CPU (updated automatically by the hypervisor). */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)
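/*
 * Derive the CPU frequency (in kHz) by inverting vcpu 0's TSC-to-nanosecond
 * scaling factors published by Xen in shared_info.
 */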
unsigned long xen_cpu_khz(void)
{
u64 cpu_khz = 1000000ULL << 32;
const struct vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
do_div(cpu_khz, info->tsc_to_system_mul);
if (info->tsc_shift < 0)
cpu_khz <<= -info->tsc_shift;
else
cpu_khz >>= info->tsc_shift;
return cpu_khz;
}
/*
* Reads a consistent set of time-base values from Xen, into a shadow data
* area.
*/
static void get_time_values_from_xen(void)
{
struct vcpu_time_info *src;
struct shadow_time_info *dst;
src = &read_pda(xen.vcpu)->time;
dst = &get_cpu_var(shadow_time);
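/*
 * Xen bumps the version field before and after each update, so an odd
 * version or a version change during the copy means we raced with an
 * update and must retry.
 */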
do {
dst->version = src->version;
rmb();
dst->tsc_timestamp = src->tsc_timestamp;
dst->system_timestamp = src->system_time;
dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
dst->tsc_shift = src->tsc_shift;
rmb();
} while ((src->version & 1) | (dst->version ^ src->version));
put_cpu_var(shadow_time);
}
static inline int time_values_up_to_date(void)
{
struct vcpu_time_info *src;
unsigned dstversion;
src = &read_pda(xen.vcpu)->time;
dstversion = get_cpu_var(shadow_time).version;
put_cpu_var(shadow_time);
rmb();
return (dstversion == src->version);
}
/*
 * Scale a 64-bit delta by shifting and then multiplying by a 32-bit
 * fraction (a multiplier scaled by 2^32), yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
u64 product;
#ifdef __i386__
u32 tmp1, tmp2;
#endif
if (shift < 0)
delta >>= -shift;
else
delta <<= shift;
#ifdef __i386__
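/*
 * 32-bit: compute the high 64 bits of the 96-bit product delta * mul_frac,
 * i.e. (delta * mul_frac) >> 32, from two 32x32->64 multiplies.
 */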
__asm__ (
"mul %5 ; "
"mov %4,%%eax ; "
"mov %%edx,%4 ; "
"mul %5 ; "
"xor %5,%5 ; "
"add %4,%%eax ; "
"adc %5,%%edx ; "
: "=A" (product), "=r" (tmp1), "=r" (tmp2)
: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
__asm__ (
"mul %%rdx ; shrd $32,%%rdx,%%rax"
: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif
return product;
}
static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
u64 now, delta;
rdtscll(now);
delta = now - shadow->tsc_timestamp;
return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}
static void xen_timer_interrupt_hook(void)
{
s64 delta, delta_cpu, stolen, blocked;
u64 sched_time;
int i, cpu = smp_processor_id();
unsigned long ticks;
struct shadow_time_info *shadow = &__get_cpu_var(shadow_time);
struct vcpu_runstate_info *runstate = &__get_cpu_var(runstate);
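/* Retry the whole snapshot if Xen republishes its time record while we
 * are working from our shadow copy of it. */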
do {
get_time_values_from_xen();
/* Obtain a consistent snapshot of elapsed wallclock cycles. */
delta = delta_cpu =
shadow->system_timestamp + get_nsec_offset(shadow);
if (0)
printk("tsc_timestamp=%llu system_timestamp=%llu
tsc_to_nsec=%u tsc_shift=%d, version=%u, delta=%lld
processed_system_time=%lld\n",
shadow->tsc_timestamp, shadow->system_timestamp,
shadow->tsc_to_nsec_mul, shadow->tsc_shift,
shadow->version, delta, processed_system_time);
delta -= processed_system_time;
delta_cpu -= __get_cpu_var(processed_system_time);
/*
* Obtain a consistent snapshot of stolen/blocked cycles. We
* can use state_entry_time to detect if we get preempted here.
*/
do {
sched_time = runstate->state_entry_time;
barrier();
stolen = runstate->time[RUNSTATE_runnable] +
runstate->time[RUNSTATE_offline] -
__get_cpu_var(processed_stolen_time);
blocked = runstate->time[RUNSTATE_blocked] -
__get_cpu_var(processed_blocked_time);
barrier();
} while (sched_time != runstate->state_entry_time);
} while (!time_values_up_to_date());
if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
unlikely(delta_cpu < -(s64)permitted_clock_jitter))
&& printk_ratelimit()) {
printk("Timer ISR/%d: Time went backwards: "
"delta=%lld delta_cpu=%lld shadow=%lld "
"off=%lld processed=%lld cpu_processed=%lld\n",
cpu, delta, delta_cpu, shadow->system_timestamp,
(s64)get_nsec_offset(shadow),
processed_system_time,
__get_cpu_var(processed_system_time));
for (i = 0; i < num_online_cpus(); i++)
printk(" %d: %lld\n", i,
per_cpu(processed_system_time, i));
}
/* System-wide jiffy work. */
ticks = 0;
while (delta > NS_PER_TICK) {
delta -= NS_PER_TICK;
processed_system_time += NS_PER_TICK;
ticks++;
}
do_timer(ticks);
/*
* Account stolen ticks.
* HACK: Passing NULL to account_steal_time()
* ensures that the ticks are accounted as stolen.
*/
if ((stolen > 0) && (delta_cpu > 0)) {
delta_cpu -= stolen;
if (unlikely(delta_cpu < 0))
stolen += delta_cpu; /* clamp local-time progress */
do_div(stolen, NS_PER_TICK);
__get_cpu_var(processed_stolen_time) += stolen * NS_PER_TICK;
__get_cpu_var(processed_system_time) += stolen * NS_PER_TICK;
account_steal_time(NULL, (cputime_t)stolen);
}
/*
* Account blocked ticks.
* HACK: Passing idle_task to account_steal_time()
* ensures that the ticks are accounted as idle/wait.
*/
if ((blocked > 0) && (delta_cpu > 0)) {
delta_cpu -= blocked;
if (unlikely(delta_cpu < 0))
blocked += delta_cpu; /* clamp local-time progress */
do_div(blocked, NS_PER_TICK);
__get_cpu_var(processed_blocked_time) += blocked * NS_PER_TICK;
__get_cpu_var(processed_system_time) += blocked * NS_PER_TICK;
account_steal_time(idle_task(cpu), (cputime_t)blocked);
}
update_process_times(user_mode_vm(get_irq_regs()));
}
static cycle_t xen_clocksource_read(void)
{
struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
cycle_t ret;
get_time_values_from_xen();
ret = shadow->system_timestamp + get_nsec_offset(shadow);
put_cpu_var(shadow_time);
return ret;
}
static void xen_read_wallclock(struct timespec *ts)
{
const struct shared_info *s = HYPERVISOR_shared_info;
u32 version;
u64 delta;
struct timespec now;
/* get wallclock at system boot */
do {
version = s->wc_version;
rmb();
now.tv_sec = s->wc_sec;
now.tv_nsec = s->wc_nsec;
rmb();
} while ((s->wc_version & 1) | (version ^ s->wc_version));
delta = xen_clocksource_read(); /* time since system boot */
delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
now.tv_nsec = do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
unsigned long xen_get_wallclock(void)
{
struct timespec ts;
xen_read_wallclock(&ts);
return ts.tv_sec;
}
int xen_set_wallclock(unsigned long now)
{
/* do nothing for domU */
return -1;
}
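/*
 * The "cycle" counter handed to the generic clocksource code is already in
 * nanoseconds, so mult and shift cancel out:
 * ns = (cycles * (1 << XEN_SHIFT)) >> XEN_SHIFT. To the rest of the kernel
 * this looks like a 1GHz clocksource.
 */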
static struct clocksource xen_clocksource = {
.name = "xen",
.rating = 400,
.read = xen_clocksource_read,
.mask = ~0,
.mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
.shift = XEN_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
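/*
 * Ask the hypervisor to keep this vcpu's runstate info (running/runnable/
 * blocked/offline times) updated in our per-cpu area, and record the
 * starting values so stolen and blocked time can be accounted incrementally.
 */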
static void init_missing_ticks_accounting(int cpu)
{
struct vcpu_register_runstate_memory_area area;
struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
memset(runstate, 0, sizeof(*runstate));
area.addr.v = runstate;
HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
per_cpu(processed_blocked_time, cpu) =
runstate->time[RUNSTATE_blocked];
per_cpu(processed_stolen_time, cpu) =
runstate->time[RUNSTATE_runnable] +
runstate->time[RUNSTATE_offline];
}
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
/*
 * Here we are in the timer irq handler. We have irqs locally
 * disabled, but we don't know if the timer_bh is running on another
 * CPU, so we need to avoid an SMP race with it. NOTE: we don't need
 * the irq version of write_lock because, as just said, we have irqs
 * locally disabled. -arca
 */
write_seqlock(&xtime_lock);
xen_timer_interrupt_hook();
write_sequnlock(&xtime_lock);
return IRQ_HANDLED;
}
static void setup_cpu0_timer_irq(void)
{
printk(KERN_DEBUG "installing Xen timer for CPU 0\n");
bind_virq_to_irqhandler(
VIRQ_TIMER,
0,
xen_timer_interrupt,
SA_INTERRUPT,
"timer0",
NULL);
}
__init void xen_time_init(void)
{
get_time_values_from_xen();
processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
per_cpu(processed_system_time, 0) = processed_system_time;
init_missing_ticks_accounting(0);
clocksource_register(&xen_clocksource);
/* Set initial system time with full resolution */
xen_read_wallclock(&xtime);
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
tsc_disable = 0;
setup_cpu0_timer_irq();
}
/* Convert jiffies to system time. */
static u64 jiffies_to_st(unsigned long j)
{
unsigned long seq;
long delta;
u64 st;
do {
seq = read_seqbegin(&xtime_lock);
delta = j - jiffies;
if (delta < 1) {
/* Triggers in some wrap-around cases, but that's okay:
* we just end up with a shorter timeout. */
st = processed_system_time + NS_PER_TICK;
} else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
/* Very long timeout means there is no pending timer.
* We indicate this to Xen by passing zero timeout. */
st = 0;
} else {
st = processed_system_time + delta * (u64)NS_PER_TICK;
}
} while (read_seqretry(&xtime_lock, seq));
return st;
}
/*
* stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
* These functions are based on implementations from arch/s390/kernel/time.c
*/
void stop_hz_timer(void)
{
unsigned int cpu = smp_processor_id();
unsigned long j;
cpu_set(cpu, nohz_cpu_mask);
/*
* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs
* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a
* value of rcp->cur that matches rdp->quiescbatch and allows us to
* stop the hz timer then the cpumasks created for subsequent values
* of cur in rcu_start_batch are guaranteed to pick up the updated
* nohz_cpu_mask and so will not depend on this cpu.
*/
smp_mb();
/* Leave ourselves in tick mode if rcu or softirq or timer pending. */
if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
(j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
cpu_clear(cpu, nohz_cpu_mask);
j = jiffies + 1;
}
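/* Program Xen's singleshot timer for the chosen wakeup time (a value of
 * 0 tells Xen there is no pending timeout, see jiffies_to_st()). */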
if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
BUG();
}
void start_hz_timer(void)
{
cpu_clear(smp_processor_id(), nohz_cpu_mask);
}