[XenPPC] copy_page speedup using dcbz on target

Using dcbz on the destination page avoids first reading each cache line from
memory before writing to it: dcbz establishes the line in the data cache and
zeroes it, so the destination lines never have to be fetched from memory.

Timing results (starting with a clean cache, i.e. no write-backs of dirty lines):

JS20:
elapsed time: 0x0000000000009f5e
elapsed time using dcbz: 0x000000000000569e

elapsed time: 0x0000000000009fe9
elapsed time using dcbz: 0x0000000000005765


JS21:
elapsed time: 0x000000000000089e
elapsed time using dcbz: 0x0000000000000439

elapsed time: 0x0000000000000886
elapsed time using dcbz: 0x0000000000000438
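
In decimal, that works out to roughly 40800 vs. 22200 time-base ticks on the
JS20 (about a 1.8x speedup) and roughly 2200 vs. 1080 ticks on the JS21
(about 2x).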

.........................................

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

typedef unsigned char uchar;
typedef unsigned long ulong;

#define LINE_SIZE 128
#define PAGE_SIZE 0x1000

#define BUF1_SIZE (PAGE_SIZE * 64)
#define BUF2_SIZE (PAGE_SIZE)
#define BUF3_SIZE (0x800000)

static __inline__ ulong time_base(void);
static __inline__ void copy_page(void *dp, void *sp);
static __inline__ void cacheable_copy_page(void *dp, void *sp);
static __inline__ void cacheable_clear_page(void *addr);

static uchar clean_cache(uchar *buf3);


int main(int argc, char **argv){

  int i;
  ulong tb1, tb2;
  uchar *buf1, *buf2, *buf3, *bufp;

  buf1 = malloc(BUF1_SIZE + PAGE_SIZE);
  buf2 = malloc(BUF2_SIZE + PAGE_SIZE);
  buf3 = malloc(BUF3_SIZE + PAGE_SIZE);

  /* round each buffer up to the next page boundary
     (the original malloc pointers are intentionally leaked in this test) */
  buf1 = (uchar *)((ulong)(buf1 + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1));
  buf2 = (uchar *)((ulong)(buf2 + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1));
  buf3 = (uchar *)((ulong)(buf3 + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1));

  memset(buf1, 1, BUF1_SIZE);
  memset(buf2, 2, BUF2_SIZE);
  memset(buf3, 3, BUF3_SIZE);

  clean_cache(buf3);  
  tb1 = time_base();

  for (bufp = buf1, i = 0; i < 4; i++, bufp += PAGE_SIZE*16){
      copy_page(bufp, buf2);
      copy_page(bufp+(PAGE_SIZE*1), buf2);
      copy_page(bufp+(PAGE_SIZE*2), buf2);
      copy_page(bufp+(PAGE_SIZE*3), buf2);
      copy_page(bufp+(PAGE_SIZE*4), buf2);
      copy_page(bufp+(PAGE_SIZE*5), buf2);
      copy_page(bufp+(PAGE_SIZE*6), buf2);
      copy_page(bufp+(PAGE_SIZE*7), buf2);
      
      copy_page(bufp+(PAGE_SIZE*8), buf2);
      copy_page(bufp+(PAGE_SIZE*9), buf2);
      copy_page(bufp+(PAGE_SIZE*10), buf2);
      copy_page(bufp+(PAGE_SIZE*11), buf2);
      copy_page(bufp+(PAGE_SIZE*12), buf2);
      copy_page(bufp+(PAGE_SIZE*13), buf2);
      copy_page(bufp+(PAGE_SIZE*14), buf2);
      copy_page(bufp+(PAGE_SIZE*15), buf2);
  }

  tb2 = time_base();
  printf("elapsed time: 0x%016lx\n", tb2 - tb1);


  clean_cache(buf3);  
  tb1 = time_base();

  for (bufp = buf1, i = 0; i < 4; i++, bufp += PAGE_SIZE*16){
      cacheable_copy_page(bufp, buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*1), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*2), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*3), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*4), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*5), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*6), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*7), buf2);
      
      cacheable_copy_page(bufp+(PAGE_SIZE*8), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*9), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*10), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*11), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*12), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*13), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*14), buf2);
      cacheable_copy_page(bufp+(PAGE_SIZE*15), buf2);
  }

  tb2 = time_base();
  printf("elapsed time using dcbz: 0x%016lx\n", tb2 - tb1);

  return(0);
}


static __inline__ ulong time_base(void)
{
        ulong tb;

        __asm__ __volatile__(
        "mftb   %0      # read time base"
        : "=r" (tb));

        return tb;
}


static __inline__ void cacheable_clear_page(void *addr)
{
        ulong lines, line_size;

        line_size = LINE_SIZE;
        lines = PAGE_SIZE / line_size;

        /* dcbz establishes each line of the page in the data cache and
           zeroes it, without reading the line from memory first */
        __asm__ __volatile__(
        "mtctr  %1      # clear_page\n\
1:      dcbz    0,%0\n\
        add     %0,%0,%3\n\
        bdnz    1b"
        : "=r" (addr)
        : "r" (lines), "0" (addr), "r" (line_size)
        : "%ctr", "memory");
}


static __inline__ void copy_page(void *dp, void *sp)
{
        ulong dwords, dword_size;

        dword_size = 8;
        dwords = (PAGE_SIZE / dword_size) - 1;

        /* straight doubleword copy loop; dp, sp and dwords are all updated
           inside the asm, so they must be read-write operands, and the "b"
           constraint keeps the base registers out of r0 (ldu/stdu need RA != 0) */
        __asm__ __volatile__(
        "mtctr  %2      # copy_page\n\
        ld      %2,0(%1)\n\
        std     %2,0(%0)\n\
1:      ldu     %2,8(%1)\n\
        stdu    %2,8(%0)\n\
        bdnz    1b"
        : "+b" (dp), "+b" (sp), "+r" (dwords)
        : /* no separate inputs */
        : "%ctr", "memory");
}


static __inline__ void cacheable_copy_page(void *dp, void *sp)
{
        /* zero the destination with dcbz first so its cache lines are
           established without being fetched from memory, then copy */
        cacheable_clear_page(dp);
        copy_page(dp, sp);
}


static uchar clean_cache(uchar *buf3)
{
      int i;
      uchar uc = 0;
      volatile uchar *ucp = buf3;

      /* touch one byte per cache line across the 8MB buffer so that buf1
         and buf2 are displaced from the caches before each timed run; the
         volatile load keeps the loop from being optimized away */
      for (i = 0; i < BUF3_SIZE / LINE_SIZE; i++){
          uc += *ucp;
          ucp += LINE_SIZE;
      }

      return(uc);
}
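
For reference, this is a standalone user-space test; on a 64-bit PowerPC
Linux system it should build with something along the lines of (the file
name is arbitrary):

  gcc -m64 -O2 -o copy_page_test copy_page_test.c

Keep in mind that dcbz operates on whole cache lines, so LINE_SIZE should
match the cache-line size of the processor being tested (128 bytes here).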

_______________________________________________
Xen-ppc-devel mailing list
Xen-ppc-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-ppc-devel