WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()

To: Jan Beulich <jbeulich@xxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxx
Subject: RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
From: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
Date: Wed, 12 Nov 2008 14:51:49 +0000 (GMT)
Cc:
Delivery-date: Wed, 12 Nov 2008 06:53:01 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <491AB1EC.76E4.0078.0@xxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Jan --

I assume the 12% faster is on a benchmark...
Have you measured how much faster the copy_page_sse2
routine (standalone) is than the memcpy?  Is it a
factor of 2?

Thanks,
Dan

> -----Original Message-----
> From: Jan Beulich [mailto:jbeulich@xxxxxxxxxx]
> Sent: Wednesday, November 12, 2008 2:38 AM
> To: xen-devel@xxxxxxxxxxxxxxxxxxx
> Subject: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
> 
> 
> On top of the highmem assistance hypercalls added earlier, this provides
> a performance improvement of another 12% (measured on Xeon E5345) for
> the page copying case.
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>
> 
> Index: 2008-10-27/xen/arch/x86/Makefile
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/Makefile     2008-11-11 
> 16:19:45.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/Makefile  2008-11-11 
> 16:18:36.000000000 +0100
> @@ -11,6 +11,7 @@ subdir-$(x86_64) += x86_64
>  obj-y += apic.o
>  obj-y += bitops.o
>  obj-y += clear_page.o
> +obj-y += copy_page.o
>  obj-y += compat.o
>  obj-y += delay.o
>  obj-y += dmi_scan.o
> Index: 2008-10-27/xen/arch/x86/copy_page.S
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ 2008-10-27/xen/arch/x86/copy_page.S       2008-06-03 
> 14:24:57.000000000 +0200
> @@ -0,0 +1,66 @@
> +#include <xen/config.h>
> +#include <asm/page.h>
> +
> +#ifdef __i386__
> +#define src_reg %esi
> +#define dst_reg %edi
> +#define WORD_SIZE 4
> +#define tmp1_reg %eax
> +#define tmp2_reg %edx
> +#define tmp3_reg %ebx
> +#define tmp4_reg %ebp
> +#else
> +#define src_reg %rsi
> +#define dst_reg %rdi
> +#define WORD_SIZE 8
> +#define tmp1_reg %r8
> +#define tmp2_reg %r9
> +#define tmp3_reg %r10
> +#define tmp4_reg %r11
> +#endif
> +
> +ENTRY(copy_page_sse2)
> +#ifdef __i386__
> +        push    %ebx
> +        push    %ebp
> +        push    %esi
> +        push    %edi
> +        mov     6*4(%esp), src_reg
> +        mov     5*4(%esp), dst_reg
> +#endif
> +        mov     $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx
> +
> +        prefetchnta 2*4*WORD_SIZE(src_reg)
> +        mov     (src_reg), tmp1_reg
> +        mov     WORD_SIZE(src_reg), tmp2_reg
> +        mov     2*WORD_SIZE(src_reg), tmp3_reg
> +        mov     3*WORD_SIZE(src_reg), tmp4_reg
> +
> +0:      prefetchnta 3*4*WORD_SIZE(src_reg)
> +1:      add     $4*WORD_SIZE, src_reg
> +        movnti  tmp1_reg, (dst_reg)
> +        mov     (src_reg), tmp1_reg
> +        dec     %ecx
> +        movnti  tmp2_reg, WORD_SIZE(dst_reg)
> +        mov     WORD_SIZE(src_reg), tmp2_reg
> +        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
> +        mov     2*WORD_SIZE(src_reg), tmp3_reg
> +        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
> +        lea     4*WORD_SIZE(dst_reg), dst_reg
> +        mov     3*WORD_SIZE(src_reg), tmp4_reg
> +        jg      0b
> +        jpe     1b
> +
> +        movnti  tmp1_reg, (dst_reg)
> +        movnti  tmp2_reg, WORD_SIZE(dst_reg)
> +        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
> +        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
> +
> +#ifdef __i386__
> +        pop     %edi
> +        pop     %esi
> +        pop     %ebp
> +        pop     %ebx
> +#endif
> +        sfence
> +        ret
> Index: 2008-10-27/xen/arch/x86/domain.c
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/domain.c     2008-11-11 
> 14:55:44.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/domain.c  2008-11-11 
> 16:24:48.000000000 +0100
> @@ -183,7 +183,8 @@ static int setup_compat_l4(struct vcpu *
>      /* This page needs to look like a pagetable so that it 
> can be shadowed */
>      pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
>  
> -    l4tab = copy_page(page_to_virt(pg), idle_pg_table);
> +    l4tab = page_to_virt(pg);
> +    copy_page(l4tab, idle_pg_table);
>      l4tab[0] = l4e_empty();
>      l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
>          l4e_from_page(pg, __PAGE_HYPERVISOR);
> Index: 2008-10-27/xen/arch/x86/domain_build.c
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/domain_build.c       
> 2008-11-11 16:19:45.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/domain_build.c    2008-11-11 
> 16:18:36.000000000 +0100
> @@ -467,8 +467,9 @@ int __init construct_dom0(
>      /* WARNING: The new domain must have its 'processor' 
> field filled in! */
>      l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc 
> += PAGE_SIZE;
>      l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc 
> += 4*PAGE_SIZE;
> -    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
> -    for (i = 0; i < 4; i++) {
> +    for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
> +        copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
> +                  idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
>          l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
>          l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
>              l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, 
> __PAGE_HYPERVISOR);
> Index: 2008-10-27/xen/include/asm-x86/page.h
> ===================================================================
> --- 2008-10-27.orig/xen/include/asm-x86/page.h        
> 2008-11-11 16:19:45.000000000 +0100
> +++ 2008-10-27/xen/include/asm-x86/page.h     2008-11-11 
> 16:18:36.000000000 +0100
> @@ -215,7 +215,10 @@ void clear_page_sse2(void *);
>  #define clear_page(_p)      (cpu_has_xmm2 ?                  
>            \
>                               clear_page_sse2((void *)(_p)) : 
>            \
>                               (void)memset((void *)(_p), 0, 
> PAGE_SIZE))
> -#define copy_page(_t,_f)    memcpy((void *)(_t), (void 
> *)(_f), PAGE_SIZE)
> +void copy_page_sse2(void *, const void *);
> +#define copy_page(_t,_f)    (cpu_has_xmm2 ?                  
>            \
> +                             copy_page_sse2(_t, _f) :        
>            \
> +                             (void)memcpy(_t, _f, PAGE_SIZE))
>  
>  #define mfn_valid(mfn)      ((mfn) < max_page)
>  
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-devel
>

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel