/******************************************************************************
 * tools/xenbaked.c
 *
 * Tool for collecting raw trace buffer data from Xen and 
 *  performing some accumulation operations and other processing
 *  on it.
 *
 * Copyright (C) 2004 by Intel Research Cambridge
 * Copyright (C) 2005 by Hewlett Packard, Palo Alto and Fort Collins
 *
 * Authors: Diwaker Gupta, diwaker.gupta@hp.com
 *          Rob Gardner, rob.gardner@hp.com
 *          Lucy Cherkasova, lucy.cherkasova@hp.com
 * Much code based on xentrace, authored by Mark Williamson, mark.a.williamson@intel.com
 * Date:   August, 2005
 * 
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; under version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <argp.h>
#include <signal.h>
#include <xc.h>
#include <xen/xen.h>
#include <string.h>

/*
 * Minimal user-space stand-in for an atomic counter type, defined ahead of
 * the <xen/trace.h> include that follows.
 * NOTE(review): presumably struct t_buf in <xen/trace.h> declares a member of
 * type atomic_t (this file reads rec_idx through _atomic_read) -- confirm
 * against that header.
 */
typedef struct { int counter; } atomic_t;
/* Plain read of the counter stored in an atomic_t; no barrier is implied. */
#define _atomic_read(v)		((v).counter)

#include <xen/trace.h>
#include "xenbaked.h"

/* NOTE(review): <stdio.h> is already included above and normally provides
 * stderr, so this explicit declaration looks redundant -- confirm before
 * removing it. */
extern FILE *stderr;

/***** Compile time configuration of defaults ********************************/

/* when we've got more records than this waiting, we log it to the output
 * (default for settings_t.new_data_thresh; the -t option overrides it) */
#define NEW_DATA_THRESH 1

/* sleep for this long (milliseconds) between checking the trace buffers
 * (the -s option overrides it) */
#define POLL_SLEEP_MILLIS 100

/* Size of time period, in milliseconds, represented by each sample
 * (the -m option overrides it) */
#define MS_PER_SAMPLE 100

/* CPU Frequency.  MHZ expands to nothing and only annotates the unit, so
 * CPU_FREQ evaluates to the plain number 2660 (the -f option overrides it) */
#define MHZ
#define CPU_FREQ 2660 MHZ

/***** The code **************************************************************/

/*
 * Run-time settings.  main() fills in the compile-time defaults and
 * cmd_parser() overrides individual fields from the command line.
 */
typedef struct settings_st {
    /* first positional argument; not read anywhere else in the code visible here */
    char *outfile;
    /* pause between two polls of the trace buffers (passed to nanosleep) */
    struct timespec poll_sleep;
    /* -t option; stored but not consulted anywhere else in the code visible here */
    unsigned long new_data_thresh;
    /* length of one sample in milliseconds (see qos_update_thread_stats) */
    unsigned long ms_per_sample;
    /* CPU frequency in MHz; process_record uses it to convert cycles to time */
    double cpu_freq;
} settings_t;

/* The single, global settings instance. */
settings_t opts;

/*
 * Set to 1 by close_handler() when SIGHUP, SIGTERM or SIGINT is delivered and
 * polled by the trace-buffer loop.  volatile sig_atomic_t is the object type
 * a signal handler may portably write to.
 */
volatile sig_atomic_t interrupted = 0;
long seq_ok = 0;        /* records whose sequence number directly followed the previous one */
long seq_bad = 0;       /* records seen out of order, or after a gap in sequence numbers */
long timewarps = 0;     /* records whose cycle stamp was lower than the previous record's */
int current = 0;        /* currently running domain (set by qos_switch_in) */
int rec_count = 0;      /* total number of records processed so far */
time_t start_time;      /* wall-clock time at program start (set in main) */

/* Shared-memory statistics area; mapped by alloc_qos_data(). */
_new_qos_data *new_qos;

/*
 * Clamp a domain id so it can index the per-domain arrays: any value above
 * NDOMAINS-1 maps to the last slot.  Values below zero are passed through
 * unchanged.  The argument is evaluated twice, so it must be free of side
 * effects; it is fully parenthesised so that expression arguments such as
 * ID(a & mask) expand correctly.
 */
#define ID(X) (((X) > NDOMAINS-1) ? (NDOMAINS-1) : (X))

                                           void close_handler(int signal)
{
    /* Signal handler installed by main() for SIGHUP, SIGTERM and SIGINT.
     * It only raises the flag that the polling loop in monitor_tbufs()
     * tests; the signal number itself is not used. */
    interrupted = 1;
}

/*
 * dump_record - debugging aid: print one trace record to stdout
 * @cpu: index of the trace buffer the record was read from
 * @x:   the record; its cycle stamp, event id and first data word are shown
 */
void dump_record(int cpu, struct t_rec *x)
{
    fprintf(stdout,
            "record: cpu=%x, tsc=%llx, event=%x, d1=%lx\n",
            cpu,
            x->cycles,
            x->event,
            x->data[0]);
}


/**
 * millis_to_timespec - convert a time in milliseconds to a struct timespec
 * @millis:             time interval in milliseconds
 *
 * Returns the interval split into whole seconds (tv_sec) and the remaining
 * fraction of a second expressed in nanoseconds (tv_nsec).
 */
struct timespec millis_to_timespec(unsigned long millis)
{
    struct timespec spec;

    spec.tv_sec = millis / 1000;
    /* tv_nsec counts nanoseconds: one millisecond is 1,000,000 of them */
    spec.tv_nsec = (long)(millis % 1000) * 1000000L;

    return spec;
}


/* One row of the per-event statistics table printed by dump_stats(). */
typedef struct 
{
    int event_count;    /* number of records seen with this event id */
    int event_id;       /* trace event id this row counts */
    char *text;         /* label printed next to the count; NULL ends the table */
} stat_map_t;

/*
 * Counters for the trace events this tool knows by name.  Row 0 ("Other") is
 * also incremented by log_event() for any event id that matches no row.  The
 * final row, whose text is 0, is the end-of-table sentinel.
 */
stat_map_t stat_map[] = {
    { 0,       0, 	    "Other" },
    { 0, TRC_SCHED_DOM_ADD, "Add Domain" },
    { 0, TRC_SCHED_DOM_REM, "Remove Domain" },
    { 0, TRC_SCHED_SLEEP, "Sleep" },
    { 0, TRC_SCHED_WAKE,  "Wake" },
    { 0, TRC_SCHED_BLOCK,  "Block" },
    { 0, TRC_SCHED_SWITCH,  "Switch" },
    { 0, TRC_SCHED_S_TIMER_FN, "Timer Func"},
    { 0, TRC_SCHED_SWITCH_INFPREV,  "Switch Prev" },
    { 0, TRC_SCHED_SWITCH_INFNEXT,  "Switch Next" },
    { 0, TRC_MEM_PAGE_FLIP,  "Page Exchange" },
    { 0, TRC_SCHED_BVT_INFO, "BVT Info"},
    { 0,      0, 		 0  }
};

/* Hit counter for one event id that has no named row in stat_map. */
typedef struct 
{
    int event_count;    /* number of records seen with this id */
    int event_id;       /* the event id; 0 marks an unused slot */
} event_count_t;

/* Capacity of event_map: at most this many distinct unknown ids are tracked. */
#define NEVENTS 100
/* Filled front to back by log_event(); printed by dump_event_map(). */
event_count_t event_map[NEVENTS];

void dump_event_map(void)
{
    int i;

    for (i=0; i<NEVENTS; i++) {
        if (event_map[i].event_id != 0)
            printf("event_id 0x%x count=%d\n", 
                    event_map[i].event_id,
                    event_map[i].event_count);
    }
}


/*
 * dump_stats - print the end-of-run summary to stdout
 *
 * Prints one line per stat_map row, the sequence/timewarp counters, the total
 * record count with the elapsed wall-clock time and average rate, and finally
 * the counts of unnamed events via dump_event_map().
 */
void dump_stats(void) 
{
    stat_map_t *smt = stat_map;
    time_t end_time, run_time;
    long per_second;

    while (smt->text != NULL) {
        printf("%08d\t%s\n", smt->event_count, smt->text);
        smt++;
    }
    printf("records in sequence: %ld \n", seq_ok);
    printf("records out of seq:  %ld \n", seq_bad);
    printf("time warps: %ld\n", timewarps);
    time(&end_time);

    run_time = end_time - start_time;

    /* A run shorter than one second (or a clock stepped backwards) would
     * otherwise divide by zero or yield a negative rate; in that case the
     * elapsed time is treated as one second. */
    if (run_time > 0)
        per_second = (long)(rec_count / run_time);
    else
        per_second = (long)rec_count;

    printf("processed %d total records in %d seconds, which is %ld per second\n",
            rec_count, (int)run_time, per_second);
    dump_event_map();
}

void log_event(int event_id) 
{
    stat_map_t *smt = stat_map;
    static int event_map_init_done = 0;
    int i;

    if (!event_map_init_done) {
        event_map_init_done = 1;
        for (i=0; i<NEVENTS; i++) {
            event_map[i].event_count = 0;
            event_map[i].event_id = 0;
        }
    }


    //  printf("event_id = 0x%x\n", event_id);

    while (smt->text != NULL) {
        if (smt->event_id == event_id) {
            smt->event_count++;
            return;
        }
        smt++;
    }
    if (smt->text == NULL)
        stat_map[0].event_count++;	// other

    // for unknown events, keep track separately
    for (i=0; i<NEVENTS; i++) {
        if (event_map[i].event_id == event_id) {
            event_map[i].event_count++;
            break;
        }
        if (event_map[i].event_id == 0) {
            event_map[i].event_id = event_id;
            event_map[i].event_count = 1;
            break;
        }
    }
}

/**
 * get_tbufs - get pointer to and size of the trace buffers
 * @mach_addr: location to store the machine address of the trace buffers to
 * @size:      location to store the size of a trace buffer to
 *
 * Gets the machine address of the trace pointer area and the size of the
 * per CPU buffers.  Exits the process with EXIT_FAILURE if the control
 * interface cannot be opened or the dom0 operation fails.
 */
void get_tbufs(unsigned long *mach_addr, unsigned long *size)
{
    int ret;
    dom0_op_t op;                        /* dom0 op we'll build             */
    int xc_handle = xc_interface_open(); /* for accessing control interface */

    if ( xc_handle < 0 )
    {
        perror("Failure to open Xen control interface");
        exit(EXIT_FAILURE);
    }

    op.cmd = DOM0_TBUFCONTROL;
    op.interface_version = DOM0_INTERFACE_VERSION;
    op.u.tbufcontrol.op  = DOM0_TBUF_GET_INFO;

    ret = xc_dom0_op(xc_handle, &op);

    if ( ret != 0 )
    {
        /* report before closing the handle, so errno still belongs to the
         * failed dom0 op */
        perror("Failure to get trace buffer pointer from Xen");
        xc_interface_close(xc_handle);
        exit(EXIT_FAILURE);
    }

    xc_interface_close(xc_handle);

    *mach_addr = op.u.tbufcontrol.mach_addr;
    *size      = op.u.tbufcontrol.size;
}

/**
 * map_tbufs - memory map Xen trace buffers into user space
 * @tbufs_mach: machine address of the trace buffers
 * @num:        number of trace buffers to map
 * @size:       size of each trace buffer
 *
 * Maps the Xen trace buffers read-only into the process address space and
 * returns the address of the mapping.  Exits the process with EXIT_FAILURE
 * if the control interface cannot be opened or the mapping fails.
 */
struct t_buf *map_tbufs(unsigned long tbufs_mach, unsigned int num,
        unsigned long size)
{
    int xc_handle;                  /* file descriptor for /proc/xen/privcmd */
    struct t_buf *tbufs_mapped;

    xc_handle = xc_interface_open();

    if ( xc_handle < 0 ) 
    {
        /* no trailing newline: perror appends ": <error text>" itself */
        perror("Open /proc/xen/privcmd when mapping trace buffers");
        exit(EXIT_FAILURE);
    }

    tbufs_mapped = xc_map_foreign_range(xc_handle, 0 /* Dom 0 ID */,
            size * num, PROT_READ,
            tbufs_mach >> XC_PAGE_SHIFT);

    if ( tbufs_mapped == NULL ) 
    {
        /* report before closing the handle, so errno still belongs to the
         * failed mapping */
        perror("Failed to mmap trace buffers");
        xc_interface_close(xc_handle);
        exit(EXIT_FAILURE);
    }

    xc_interface_close(xc_handle);

    return tbufs_mapped;
}


/**
 * init_bufs_ptrs - initialises an array of pointers to the trace buffers
 * @bufs_mapped:    the userspace address where the trace buffers are mapped
 * @num:            number of trace buffers
 * @size:           trace buffer size
 *
 * Returns a newly allocated array of @num pointers; entry k is the address
 * @bufs_mapped + k * @size, i.e. the start of the k-th buffer inside the
 * mapped region.  The caller frees the array.  Exits the process if the
 * allocation fails.
 */
struct t_buf **init_bufs_ptrs(void *bufs_mapped, unsigned int num,
        unsigned long size)
{
    struct t_buf **ptrs = calloc(num, sizeof(struct t_buf *));
    unsigned long addr = (unsigned long)bufs_mapped;
    unsigned int k;

    if ( ptrs == NULL )
    {
        perror( "Failed to allocate memory for buffer pointers\n");
        exit(EXIT_FAILURE);
    }

    /* the buffers lie back to back, so walk the region one buffer at a time */
    for ( k = 0; k < num; k++ )
    {
        ptrs[k] = (struct t_buf *)addr;
        addr += size;
    }

    return ptrs;
}


/**
 * init_rec_ptrs - initialises data area pointers to locations in user space
 * @tbufs_mach:    machine base address of the trace buffer area
 * @tbufs_mapped:  user virtual address of base of trace buffer area
 * @meta:          array of user-space pointers to struct t_buf's of metadata
 * @num:           number of trace buffers
 *
 * Each buffer's metadata holds the machine address of its record area
 * (rec_addr).  This returns a newly allocated array in which entry k is that
 * address rebased from @tbufs_mach onto @tbufs_mapped, so the records can be
 * reached through the user-space mapping.  The caller frees the array.
 * Exits the process if the allocation fails.
 */
struct t_rec **init_rec_ptrs(unsigned long tbufs_mach,
        struct t_buf *tbufs_mapped,
        struct t_buf **meta,
        unsigned int num)
{
    struct t_rec **rec_ptrs = calloc(num, sizeof(struct t_rec *));
    unsigned int buf = 0;

    if ( rec_ptrs == NULL )
    {
        perror("Failed to allocate memory for data pointers\n");
        exit(EXIT_FAILURE);
    }

    while ( buf < num )
    {
        rec_ptrs[buf] = (struct t_rec *)(meta[buf]->rec_addr - tbufs_mach
                + (unsigned long)tbufs_mapped);
        buf++;
    }

    return rec_ptrs;
}

/**
 * init_tail_idxs - initialise an array of tail indexes
 * @bufs:           array of pointers to trace buffer metadata
 * @num:            number of trace buffers
 *
 * The tail indexes indicate where we've read up to so far in the data array
 * of a trace buffer.  Each entry in this table corresponds to the tail index
 * for a particular trace buffer and starts out at that buffer's current
 * rec_idx.  The caller frees the array.  Exits the process if the allocation
 * fails.
 */
unsigned long *init_tail_idxs(struct t_buf **bufs, unsigned int num)
{
    unsigned int i;
    /* size each element from the pointee type: the entries are unsigned
     * long, which is wider than unsigned int on LP64 targets */
    unsigned long *tails = calloc(num, sizeof *tails);

    if ( tails == NULL )
    {
        perror("Failed to allocate memory for tail pointers\n");
        exit(EXIT_FAILURE);
    }

    for ( i = 0; i<num; i++ )
        tails[i] = _atomic_read(bufs[i]->rec_idx);

    return tails;
}

/**
 * get_num_cpus - get the number of logical CPUs
 *
 * Issues a DOM0_PHYSINFO operation and returns the product of threads per
 * core, cores per socket, sockets per node and number of nodes.  Exits the
 * process with EXIT_FAILURE if the control interface cannot be opened or the
 * operation fails.
 */
unsigned int get_num_cpus()
{
    dom0_op_t op;
    int xc_handle = xc_interface_open();
    int ret;

    if ( xc_handle < 0 )
    {
        perror("Failure to open Xen control interface");
        exit(EXIT_FAILURE);
    }

    op.cmd = DOM0_PHYSINFO;
    op.interface_version = DOM0_INTERFACE_VERSION;

    ret = xc_dom0_op(xc_handle, &op);

    if ( ret != 0 )
    {
        /* report first, so errno still belongs to the failed dom0 op */
        perror("Failure to get logical CPU count from Xen");
        xc_interface_close(xc_handle);
        exit(EXIT_FAILURE);
    }

    xc_interface_close(xc_handle);

    return (op.u.physinfo.threads_per_core *
            op.u.physinfo.cores_per_socket *
            op.u.physinfo.sockets_per_node *
            op.u.physinfo.nr_nodes);
}


/**
 * monitor_tbufs - monitor the contents of tbufs
 *
 * Maps the per-CPU trace buffers, then polls them until the global
 * `interrupted' flag is set, passing every new record to process_record().
 * Between polls it sleeps for opts.poll_sleep.  Exits the process with
 * EXIT_FAILURE if the reported buffer size cannot hold a single record.
 * Returns 0 once interrupted.
 */
int monitor_tbufs()
{
    unsigned int i;
    extern void process_record(int, struct t_rec *);
    void *tbufs_mapped;          /* pointer to where the tbufs are mapped    */
    struct t_buf **meta;         /* pointers to the trace buffer metadata    */
    struct t_rec **data;         /* pointers to the trace buffer data areas
                                  * where they are mapped into user space.   */
    unsigned long *cons;         /* store tail indexes for the trace buffers */
    unsigned long tbufs_mach;    /* machine address of the tbufs             */
    unsigned int  num;           /* number of trace buffers / logical CPUS   */
    unsigned long size;          /* size of a single trace buffer            */

    int size_in_recs;            /* records that fit in one trace buffer     */

    /* get number of logical CPUs (and therefore number of trace buffers) */
    num = get_num_cpus();

    /* setup access to trace buffers */
    get_tbufs(&tbufs_mach, &size);

    /* sizeof yields size_t and size is unsigned long: use matching formats */
    printf("from dom0op: %lu, t_buf: %zu, t_rec: %zu\n",
            size, sizeof(struct t_buf), sizeof(struct t_rec));

    /* A buffer with no room for even one record would make size_in_recs
     * zero (or wrap around) and the index arithmetic below divide by zero. */
    if ( size < sizeof(struct t_buf) + sizeof(struct t_rec) )
    {
        fprintf(stderr, "Trace buffer size %lu is too small to hold any records\n",
                size);
        exit(EXIT_FAILURE);
    }

    tbufs_mapped = map_tbufs(tbufs_mach, num, size);

    size_in_recs = (size - sizeof(struct t_buf)) / sizeof(struct t_rec);
    fprintf(stderr, "size_in_recs = %d\n", size_in_recs);

    /* build arrays of convenience ptrs */
    meta  = init_bufs_ptrs (tbufs_mapped, num, size);
    data  = init_rec_ptrs  (tbufs_mach, tbufs_mapped, meta, num);
    cons  = init_tail_idxs (meta, num);

    /* now, scan buffers for events */
    while ( !interrupted )
    {
        for ( i = 0; ( i < num ) && !interrupted; i++ )
            /* consume until our tail catches up with the producer index */
            while( cons[i] != _atomic_read(meta[i]->rec_idx) )
            {
                process_record((int)i, data[i] + cons[i]);
                cons[i] = (cons[i] + 1) % size_in_recs;
            }

        nanosleep(&opts.poll_sleep, NULL);
    }

    /* cleanup */
    free(meta);
    free(data);
    free(cons);
    /* don't need to munmap - cleanup is automatic */

    return 0;
}


/******************************************************************************
 * Various declarations / definitions GNU argp needs to do its work
 *****************************************************************************/


/* Parse @arg with strtol (base auto-detected).  If no characters could be
 * converted, report a usage error through argp.  The converted value (0 when
 * nothing was converted) is returned either way. */
static long parse_long_or_usage(char *arg, struct argp_state *state)
{
    char *end;
    long value = strtol(arg, &end, 0);

    if ( end == arg )
        argp_usage(state);

    return value;
}

/* command parser for GNU argp - see GNU docs for more info
 *
 * Stores each recognised option in the settings_t passed as the parser input:
 *   -t  new-records threshold       -s  poll sleep in milliseconds
 *   -m  milliseconds per sample     -f  CPU frequency
 * The first positional argument becomes the output file name; any further
 * positional argument is a usage error.  Returns 0 for handled keys and
 * ARGP_ERR_UNKNOWN for everything else. */
error_t cmd_parser(int key, char *arg, struct argp_state *state)
{
    settings_t *setup = (settings_t *)state->input;

    switch ( key )
    {
        case 't':
            setup->new_data_thresh = parse_long_or_usage(arg, state);
            break;

        case 's':
            setup->poll_sleep =
                millis_to_timespec(parse_long_or_usage(arg, state));
            break;

        case 'm':
            setup->ms_per_sample = parse_long_or_usage(arg, state);
            break;

        case 'f':
            {
                char *end;
                setup->cpu_freq = strtod(arg, &end);
                if ( end == arg )
                    argp_usage(state);
            }
            break;

        case ARGP_KEY_ARG:
            if ( state->arg_num != 0 )
                argp_usage(state);
            else
                setup->outfile = arg;
            break;

        default:
            return ARGP_ERR_UNKNOWN;
    }

    return 0;
}

#define SHARED_MEM_FILE "/tmp/xenq-shm"
/*
 * alloc_qos_data - create the shared-memory file and map it as new_qos
 *
 * Creates (or truncates) SHARED_MEM_FILE, extends it with zero-filled pages
 * until it covers sizeof(_new_qos_data), and maps it read/write and shared,
 * storing the mapping in the global new_qos.
 *
 * Exits with status 2 if the file cannot be created or filled, and with
 * status 3 if it cannot be mapped.
 */
void alloc_qos_data(void)
{
    size_t filled;
    size_t pgsize;
    char *zeros;
    int qos_fd;

    qos_fd = open(SHARED_MEM_FILE, O_RDWR|O_CREAT|O_TRUNC, 0777);
    if (qos_fd < 0) {
        perror(SHARED_MEM_FILE);
        exit(2);
    }

    pgsize = (size_t)getpagesize();

    /* zero-filled page, so no uninitialised heap content reaches the file */
    zeros = calloc(1, pgsize);
    if (zeros == NULL) {
        perror("calloc");
        exit(2);
    }

    /* grow the file page by page until it covers the whole structure */
    for (filled = 0; filled < sizeof(_new_qos_data); filled += pgsize) {
        ssize_t n = write(qos_fd, zeros, pgsize);

        if (n < 0) {
            perror(SHARED_MEM_FILE);
            exit(2);
        }
        if ((size_t)n != pgsize) {
            fprintf(stderr, "%s: short write\n", SHARED_MEM_FILE);
            exit(2);
        }
    }
    free(zeros);

    new_qos = mmap(NULL, sizeof(_new_qos_data), PROT_READ|PROT_WRITE,
            MAP_SHARED, qos_fd, 0);
    /* mmap signals failure with MAP_FAILED, not with a null pointer */
    if (new_qos == MAP_FAILED) {
        perror("mmap");
        exit(3);
    }

    /* the mapping stays valid after the descriptor is closed */
    close(qos_fd);
}


/* Two-level stringification: xstr(M) first macro-expands M and then turns the
 * result into a string literal, whereas str(M) stringifies M as written.
 * Used below to embed the numeric defaults in the option help text. */
#define xstr(x) str(x)
#define str(x) #x

/* Option table handed to GNU argp through parser_def.  Each .key is the
 * value cmd_parser() switches on; the help text embeds the compile-time
 * default via xstr().  The all-zero entry terminates the table. */
const struct argp_option cmd_opts[] =
{
    /* -t: stored in settings_t.new_data_thresh */
    { .name = "log-thresh", .key='t', .arg="l",
        .doc =
            "Set number, l, of new records required to trigger a write to output "
            "(default " xstr(NEW_DATA_THRESH) ")." },

    /* -s: converted to a timespec and stored in settings_t.poll_sleep */
    { .name = "poll-sleep", .key='s', .arg="p",
        .doc = 
            "Set sleep time, p, in milliseconds between polling the trace buffer "
            "for new data (default " xstr(POLL_SLEEP_MILLIS) ")." },

    /* -m: stored in settings_t.ms_per_sample */
    { .name = "ms_per_sample", .key='m', .arg="MS",
        .doc = 
            "Specify the number of milliseconds per sample "
            " (default " xstr(MS_PER_SAMPLE) ")." },

    /* -f: stored in settings_t.cpu_freq */
    { .name = "cpu_freq", .key='f', .arg="Mhz",
        .doc = 
            "Specify the frequency of the measured cpu "
            " (default " xstr(CPU_FREQ) ")." },

    {0}
};

/* argp parser description passed to argp_parse() in main(): ties the option
 * table to cmd_parser() and supplies the program help text.  The text after
 * the "\v" in .doc is the second part of the help message. */
const struct argp parser_def =
{
    .options = cmd_opts,
    .parser = cmd_parser,
    //    .args_doc = "[output file]",
    .doc =
        "Tool to capture and partially process Xen trace buffer data"
        "\v"
        "This tool is used to capture trace buffer data from Xen.  The data is "
        "saved in a shared memory structure to be further processed by xenmon."
};


/* Well-known globals read by GNU argp: the version string and the address
 * shown for bug reports in the generated help output. */
const char *argp_program_version     = "xenbaked v1.0";
const char *argp_program_bug_address = "<lucy.cherkasova@hp.com>";


/*
 * Program entry point: load the default settings, apply command-line
 * overrides, create and clear the shared statistics area, install the
 * termination signal handler, then process trace records until interrupted.
 * On the way out the summary is printed and the shared area is synced.
 * Returns the result of monitor_tbufs().
 */
int main(int argc, char **argv)
{
    static const int exit_signals[] = { SIGHUP, SIGTERM, SIGINT };
    struct sigaction act;
    unsigned int s;
    int ret;

    time(&start_time);

    /* compile-time defaults; argp_parse() below may override them */
    opts.outfile = 0;
    opts.poll_sleep = millis_to_timespec(POLL_SLEEP_MILLIS);
    opts.new_data_thresh = NEW_DATA_THRESH;
    opts.ms_per_sample = MS_PER_SAMPLE;
    opts.cpu_freq = CPU_FREQ;

    argp_parse(&parser_def, argc, argv, 0, 0, &opts);
    fprintf(stderr, "ms_per_sample = %ld, cpu_freq=%f\n",
            opts.ms_per_sample, opts.cpu_freq);

    alloc_qos_data();
    memset(new_qos, 0, sizeof(_new_qos_data));

    /* ensure that if we get a signal, we'll do cleanup, then exit */
    act.sa_handler = close_handler;
    act.sa_flags = 0;
    sigemptyset(&act.sa_mask);
    for (s = 0; s < sizeof(exit_signals) / sizeof(exit_signals[0]); s++)
        sigaction(exit_signals[s], &act, NULL);

    ret = monitor_tbufs();

    dump_stats();
    msync(new_qos, sizeof(_new_qos_data), MS_SYNC);

    return ret;
}

/* Return the `runnable' flag recorded for the (clamped) domain id. */
int domain_runnable(int domid)
{
    const int slot = ID(domid);

    return new_qos->domain_info[slot].runnable;
}


/*
 * update_blocked_time - account blocked time for a domain up to @now
 *
 * If a blocked period is open (blocked_start_time != 0), its length is added
 * to ns_blocked of the current datapoint, allowing for the time value having
 * wrapped.  Afterwards the period is closed (0) when the domain is runnable,
 * or restarted at @now when it is not.
 */
void update_blocked_time(int domid, u64 now)
{
    const int slot = ID(domid);
    const u64 since = new_qos->domain_info[slot].blocked_start_time;

    if (since != 0) {
        const u64 elapsed = (now >= since)
            ? now - since
            : now + (~0LL - since);     /* time value wrapped around */

        new_qos->qdata[new_qos->next_datapoint].ns_blocked[slot] += elapsed;
    }

    new_qos->domain_info[slot].blocked_start_time =
        domain_runnable(slot) ? 0 : now;
}


// Move on to the next datapoint slot for all domains: publish the new index
// first, then clear that slot's per-domain counters and stamp it with `now`.
void advance_next_datapoint(u64 now)
{
    const int prev = new_qos->next_datapoint;
    const int next = QOS_INCR(prev);
    int dom = 0;

    new_qos->next_datapoint = next;

    while (dom < NDOMAINS) {
        new_qos->qdata[next].ns_gotten[dom] = 0;
        new_qos->qdata[next].ns_allocated[dom] = 0;
        new_qos->qdata[next].ns_waiting[dom] = 0;
        new_qos->qdata[next].ns_blocked[dom] = 0;
        new_qos->qdata[next].switchin_count[dom] = 0;
        new_qos->qdata[next].io_count[dom] = 0;
        dom++;
    }

    new_qos->qdata[next].ns_passed = 0;
    new_qos->qdata[next].timestamp = now;
}



/*
 * qos_update_thread - bring one domain's accounting up to @now
 *
 * Records @now as the domain's last update time.  If the domain was runnable
 * at the previous update and is the one in `current`, the time since it
 * started running is added to ns_oncpu_since_boot, its start time is reset to
 * @now, and the time since the previous update is added to ns_since_boot.
 * The runnable snapshot and the blocked-time accounting are then refreshed,
 * and the time elapsed since the current datapoint's timestamp is added to
 * that datapoint's ns_passed (skipped when the timestamp lies ahead of @now).
 */
void qos_update_thread(int domid, u64 now)
{
    const int slot = ID(domid);
    const int sample = new_qos->next_datapoint;
    const u64 prev_update = new_qos->domain_info[slot].last_update_time;
    /* time since the previous update, allowing for wraparound */
    const u64 since_update = (prev_update > now)
        ? now + (~0LL - prev_update)
        : now - prev_update;

    new_qos->domain_info[slot].last_update_time = now;

    if (new_qos->domain_info[slot].runnable_at_last_update && (domid == current)) {
        const u64 began = new_qos->domain_info[slot].start_time;
        /* time on the cpu since it was last stamped, allowing for wraparound */
        const u64 on_cpu = (began > now)
            ? now + (~0LL - began)
            : now - began;

        new_qos->domain_info[slot].ns_oncpu_since_boot += on_cpu;
        new_qos->domain_info[slot].start_time = now;
        new_qos->domain_info[slot].ns_since_boot += since_update;
    }

    new_qos->domain_info[slot].runnable_at_last_update = domain_runnable(domid);

    update_blocked_time(domid, now);

    /* how much time passed since this datapoint was updated?  When the
     * stored timestamp is ahead of `now`, nothing is accumulated. */
    if (now >= new_qos->qdata[sample].timestamp)
        new_qos->qdata[sample].ns_passed += (now - new_qos->qdata[sample].timestamp);

    new_qos->qdata[sample].timestamp = now;
}


// Refresh the accounting of every domain slot currently marked in use.
void qos_update_all(u64 now)
{
    int dom = 0;

    while (dom < NDOMAINS) {
        if (new_qos->domain_info[dom].in_use)
            qos_update_thread(dom, now);
        dom++;
    }
}


/*
 * qos_update_thread_stats - update accounting, rolling the sample if due
 *
 * When the current datapoint has accumulated more than opts.ms_per_sample
 * milliseconds (ns_passed compared against million * ms_per_sample), every
 * in-use domain is updated and the next datapoint is started.  Otherwise only
 * the given domain is updated.
 */
void qos_update_thread_stats(int domid, u64 now)
{
    const int sample_full =
        new_qos->qdata[new_qos->next_datapoint].ns_passed > (million*opts.ms_per_sample);

    if (sample_full) {
        qos_update_all(now);
        advance_next_datapoint(now);
    } else {
        qos_update_thread(domid, now);
    }
}


/*
 * qos_init_domain - start tracking a domain that is not yet in use
 * @cpu:   cpu the triggering record came from (used in the idle task's name)
 * @domid: domain id; clamped with ID() to select the slot
 * @now:   current time, stored as the slot's last update time
 *
 * Does nothing if the slot is already in use.  Otherwise the slot is cleared,
 * marked in use and named, and the domain's counters are zeroed in every
 * sample.
 */
void qos_init_domain(int cpu, int domid, u64 now)
{
    const int slot = ID(domid);
    int sample;

    if (new_qos->domain_info[slot].in_use)
        return;

    memset(&new_qos->domain_info[slot], 0, sizeof(_domain_info));
    new_qos->domain_info[slot].last_update_time = now;
    new_qos->domain_info[slot].runnable_start_time = 0; // invalidate
    new_qos->domain_info[slot].in_use = 1;
    new_qos->domain_info[slot].blocked_start_time = 0;
    new_qos->domain_info[slot].id = slot;

    if (domid != IDLE_DOMAIN_ID)
        sprintf(new_qos->domain_info[slot].name, "Domain#%d", domid);
    else
        sprintf(new_qos->domain_info[slot].name, "Idle Task%d", cpu);

    sample = 0;
    while (sample < NSAMPLES) {
        new_qos->qdata[sample].ns_gotten[slot] = 0;
        new_qos->qdata[sample].ns_allocated[slot] = 0;
        new_qos->qdata[sample].ns_waiting[slot] = 0;
        new_qos->qdata[sample].ns_blocked[slot] = 0;
        new_qos->qdata[sample].switchin_count[slot] = 0;
        new_qos->qdata[sample].io_count[slot] = 0;
        sample++;
    }
}


// Called when a domain is given the cpu.  Marks it runnable, closes any open
// blocked/runnable-wait periods, stamps its start time, and adds the switch-in
// count, allocated time and waited time to the current datapoint before the
// regular stats update.
void qos_switch_in(int domid, u64 now, unsigned long ns_alloc, unsigned long ns_waited)
{
    const int slot = ID(domid);
    int sample;

    new_qos->domain_info[slot].runnable = 1;
    update_blocked_time(domid, now);
    new_qos->domain_info[slot].blocked_start_time = 0;  // invalidate
    new_qos->domain_info[slot].runnable_start_time = 0; // invalidate

    new_qos->domain_info[slot].start_time = now;

    sample = new_qos->next_datapoint;
    new_qos->qdata[sample].switchin_count[slot]++;
    new_qos->qdata[sample].ns_allocated[slot] += ns_alloc;
    new_qos->qdata[sample].ns_waiting[slot] += ns_waited;

    current = slot;
    qos_update_thread_stats(domid, now);
}

// Called when a domain is taken off the cpu after receiving `gotten` ns.
// Suspiciously small values are reported on stdout, then the time is credited
// to the current datapoint and to the domain's on-cpu total, and the domain
// starts waiting for the cpu again as of `now`.
void qos_switch_out(int domid, u64 now, unsigned long gotten)
{
    const int slot = ID(domid);
    const int sample = new_qos->next_datapoint;

    if (gotten < 1000) {
        if (gotten == 0)
            printf("gotten==0 in qos_switchout(domid=%d)\n", domid);
        printf("gotten<1000ns in qos_switchout(domid=%d)\n", domid);
    }

    new_qos->qdata[sample].ns_gotten[slot] += gotten;
    new_qos->domain_info[slot].ns_oncpu_since_boot += gotten;
    new_qos->domain_info[slot].runnable_start_time = now;

    qos_update_thread_stats(slot, now);
}

// Called when a domain is put to sleep or blocks.  A repeated call for a
// domain that is already not runnable is ignored; otherwise the domain is
// marked not runnable, its blocked period starts at `now`, and the regular
// stats update runs.
void qos_state_sleeping(int domid, u64 now) 
{
    const int slot = ID(domid);

    if (domain_runnable(slot)) {
        new_qos->domain_info[slot].runnable = 0;
        new_qos->domain_info[slot].blocked_start_time = now;
        new_qos->domain_info[slot].runnable_start_time = 0; // invalidate
        qos_update_thread_stats(domid, now);
    }
}



/* Stop tracking a domain: mark its (clamped) slot as no longer in use. */
void qos_kill_thread(int domid)
{
    const int slot = ID(domid);

    new_qos->domain_info[slot].in_use = 0;
}


// Called when a domain becomes runnable.  A repeated call for a domain that
// is already runnable is ignored; otherwise the domain is marked runnable,
// its blocked time is accounted, the regular stats update runs, and it starts
// waiting for the cpu as of `now`.
void qos_state_runnable(int domid, u64 now)
{
    const int slot = ID(domid);

    if (!domain_runnable(slot)) {
        new_qos->domain_info[slot].runnable = 1;
        update_blocked_time(domid, now);

        qos_update_thread_stats(domid, now);

        new_qos->domain_info[slot].blocked_start_time = 0; /* invalidate */
        new_qos->domain_info[slot].runnable_start_time = now;
    }
}


/*
 * qos_count_packets - count one I/O event in the current datapoint
 *
 * The domain's own io_count is bumped only if its slot is in use; slot 0's
 * io_count is bumped on every call.  @now is not used.
 */
void qos_count_packets(domid_t domid, u64 now)
{
    const int slot = ID(domid);
    const int sample = new_qos->next_datapoint;

    if (new_qos->domain_info[slot].in_use)
        new_qos->qdata[sample].io_count[slot]++;

    new_qos->qdata[sample].io_count[0]++;
}


/*
 * domain_ok - validate a domain id taken from a trace record
 *
 * The idle domain id is mapped to the last slot (NDOMAINS-1).  Ids outside
 * 0..NDOMAINS-1 are reported on stdout and rejected with 0.  For a valid id
 * the slot is initialised on first sight and 1 is returned.
 */
int domain_ok(int cpu, int domid, u64 now)
{
    // ooops, have to handle multiple idle domains
    const int slot = (domid == IDLE_DOMAIN_ID) ? (NDOMAINS-1) : domid;

    if (slot < 0 || slot >= NDOMAINS) {
        printf("bad domain id: %d\n", slot);
        return 0;
    }

    if (new_qos->domain_info[slot].in_use == 0)
        qos_init_domain(cpu, slot, now);

    return 1;
}


/*
 * process_record - account one trace record, tolerating mild reordering
 * @cpu: index of the trace buffer the record was read from
 * @r:   the newly read record
 *
 * A one-record window is kept in static storage (prev_rec / curr_rec).  The
 * very first call only stores the record and returns.  On later calls the
 * new record is compared with the stored one and `r` is re-pointed at the
 * record chosen for processing, so records are normally handled one call
 * after they arrive.  The chosen record's cycle stamp is converted to a time
 * value using opts.cpu_freq and dispatched to the qos_* handlers by event
 * type.
 */
void process_record(int cpu, struct t_rec *r)
{
    u64 now;
    static struct t_rec prev_rec, curr_rec, tmp_rec;
    static int firstcall = 1;
    int seqdiff;


    /* first record ever: nothing to compare against yet, just remember it */
    if (firstcall) {
        firstcall = 0;
        curr_rec = *r;
        return;
    }

    /* slide the window: the stored record becomes "previous" */
    prev_rec = curr_rec;
    curr_rec = *r;

    // calculate difference between current sequence number and last one
    // (computed before any swap below and not recomputed afterwards)
    seqdiff = curr_rec.seqno - prev_rec.seqno;


    // cycle timestamp is out of order; swap records
    // with priority over out of order sequence numbers
    if (prev_rec.cycles > curr_rec.cycles) {
        tmp_rec = curr_rec;		// so swap them
        timewarps++;
#if 0
        printf("decreasing now value, seq #'s: 0x%lx 0x%lx\n",
                prev_rec.seqno, curr_rec.seqno);
        printf("  prev_cycles = 0x%llx\n  curr_cycles = 0x%llx\n",
                prev_rec.cycles, curr_rec.cycles);
#endif
        curr_rec = prev_rec;
        prev_rec = tmp_rec;
        r = &prev_rec;
        seq_bad++;
    }
    else if (seqdiff == -1) {     // out of order record: prev == current + 1
        tmp_rec = curr_rec;		// so swap them
        curr_rec = prev_rec;
        prev_rec = tmp_rec;
        r = &prev_rec;
        seq_bad++;
        //    printf("OOO Sequence #0x%lx\n", r->seqno);
    }

    /* second, independent classification on the same seqdiff value */
    if (seqdiff > 1) {	// lost records: current == prev + N
        seq_bad++;
        /* note: r is left as set above (or as passed in) on this path */
        new_qos->qdata[new_qos->next_datapoint].lost_records += seqdiff;
    }
    else if (seqdiff == 1) {	// normal: current == prev + 1 
        r = &prev_rec;
        seq_ok++;
    }
    else if (seqdiff < -1) {      // confusion: current == prev - N
        printf("utter chaos\n");
    }


    // at this point, r points to the record to be processed

    rec_count++;

    /* cycles divided by (MHz / 1000), i.e. by cycles per nanosecond */
    now = ((double)r->cycles) / (opts.cpu_freq / 1000.0);

    log_event(r->event);

    /* every handler is guarded by domain_ok(), which rejects bad domain ids
     * and initialises a domain slot on first sight */
    switch (r->event) {

        case TRC_SCHED_SWITCH_INFPREV:
            // domain data[0] just switched out and received data[1] ns of cpu time
            if (domain_ok(cpu, r->data[0], now))
                qos_switch_out(r->data[0], now, r->data[1]);
            //    printf("ns_gotten %ld\n", r->data[1]);
            break;

        case TRC_SCHED_SWITCH_INFNEXT:
            // domain data[0] just switched in and
            // waited data[1] ns, and was allocated data[2] ns of cpu time
            if (domain_ok(cpu, r->data[0], now))
                qos_switch_in(r->data[0], now, r->data[2], r->data[1]);
            break;

        case TRC_SCHED_DOM_ADD:
            if (domain_ok(cpu, r->data[0], now))
                qos_init_domain(cpu, r->data[0],  now);
            break;

        case TRC_SCHED_DOM_REM:
            if (domain_ok(cpu, r->data[0], now))
                qos_kill_thread(r->data[0]);
            break;

        case TRC_SCHED_SLEEP:
            if (domain_ok(cpu, r->data[0], now))
                qos_state_sleeping(r->data[0], now);
            break;

        case TRC_SCHED_WAKE:
            if (domain_ok(cpu, r->data[0], now))
                qos_state_runnable(r->data[0], now);
            break;

        case TRC_SCHED_BLOCK:
            /* blocking is accounted exactly like sleeping */
            if (domain_ok(cpu, r->data[0], now))
                qos_state_sleeping(r->data[0], now);
            break;

        case TRC_MEM_PAGE_FLIP:
            /* each page flip is counted as one I/O event for the domain */
            if (domain_ok(cpu, r->data[0], now))
                qos_count_packets(r->data[0], now);
            break;

        default:
            /* other events only show up in the log_event() counters */
            break;
    }
}



