Sorry, I sent out the old patches by mistake.
The newest one is attached; please discard the old ones.
On Mon, Jul 24, 2006 at 09:54:28PM +0900, Isaku Yamahata wrote:
> Hi.
>
> I implemented per-vcpu VHPT for non-VTi domains.
> The motivation is to reduce the cost of vcpu migration between physical
> cpus under the credit scheduler.
> If more than one vcpu of the same domain runs on a physical cpu, the VHPT
> needs to be flushed on every vcpu switch. I'd like to avoid this.
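
To make the motivation concrete, here is a simplified sketch of the flush
decision at vcpu context switch once the series is applied
(flush_vtlb_for_context_switch(), HAS_PERVCPU_VHPT() and local_vhpt_flush()
are from the third changeset below; this is only an illustration, not the
exact code):

    /* Sketch: run when a physical cpu switches to another vcpu.
     * With the shared per-pcpu VHPT, switching between two vcpus of the
     * same non-VTi domain forces a full VHPT flush.  With a per-vcpu
     * VHPT the table follows the vcpu, so no flush is needed. */
    static void flush_vtlb_for_context_switch(struct vcpu *vcpu)
    {
        if (VMX_DOMAIN(vcpu)) {
            /* vTLB of a VT-i domain is already per vcpu: nothing to do. */
        } else if (HAS_PERVCPU_VHPT(vcpu->domain)) {
            /* per-vcpu VHPT: it is switched together with the vcpu. */
        } else {
            local_vhpt_flush();     /* purge the shared per-pcpu VHPT */
        }
        local_flush_tlb_all();
    }
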
> The patch is for discussion and performance evaluation. Not for commit.
>
>
> I checked the mailing list archives and found the thread
> Xen/ia64 - global or per VP VHPT
> http://lists.xensource.com/archives/html/xen-devel/2005-04/msg01002.html
>
> No conclusion was reached at that time.
> (At least that is my understanding; the thread was very long to follow,
> so I might be wrong -- please correct me.)
> With this patch we can measure the performance and decide whether to
> include it or discard the idea.
>
>
> This patch introduces a compile-time option, xen_ia64_pervcpu_vhpt=y,
> to enable this feature, and a Xen boot-time option, pervcpu_vhpt=0,
> to disable per-vcpu VHPT allocation.
> The patch depends on the TLB tracking patch which I sent before.
> I have attached these patches for convenience.
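
For reference, enabling both features at build time and turning per-vcpu
VHPT allocation back off at boot would look roughly like this (the exact
make invocation depends on your build setup; only the variable and option
names below come from the patches):

    # build time: plain make variables tested in xen/arch/ia64/Rules.mk
    make xen xen_ia64_tlb_track=y xen_ia64_pervcpu_vhpt=y

    # boot time: pervcpu_vhpt is an integer_param() on the Xen command line,
    # so appending "pervcpu_vhpt=0" disables the per-vcpu VHPT again
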
>
> Thanks
> --
> yamahata
> # HG changeset patch
> # User yamahata@xxxxxxxxxxxxx
> # Node ID c654d462c4481685fb2e803e41cb2beba56bee4b
> # Parent b2abc70be89e02d0d380674096c8c1fb9e552431
> import linux/include/linux/hash.h.
> PATCHNAME: import_linux_hash.h
>
> Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
>
> diff -r b2abc70be89e -r c654d462c448 xen/include/asm-ia64/linux/README.origin
> --- a/xen/include/asm-ia64/linux/README.origin Wed Jul 19 07:17:54 2006 -0600
> +++ b/xen/include/asm-ia64/linux/README.origin Mon Jul 24 21:34:37 2006 +0900
> @@ -8,6 +8,7 @@ bitmap.h -> linux/include/linux/bitmap.
> bitmap.h -> linux/include/linux/bitmap.h
> bitops.h -> linux/include/linux/bitops.h
> initrd.h -> linux/include/linux/initrd.h
> +hash.h -> linux/include/linux/hash.h
> jiffies.h -> linux/include/linux/jiffies.h
> kmalloc_sizes.h -> linux/include/linux/kmalloc_sizes.h
> linkage.h -> linux/include/linux/linkage.h
> diff -r b2abc70be89e -r c654d462c448 xen/include/asm-ia64/linux/hash.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-ia64/linux/hash.h Mon Jul 24 21:34:37 2006 +0900
> @@ -0,0 +1,58 @@
> +#ifndef _LINUX_HASH_H
> +#define _LINUX_HASH_H
> +/* Fast hashing routine for a long.
> + (C) 2002 William Lee Irwin III, IBM */
> +
> +/*
> + * Knuth recommends primes in approximately golden ratio to the maximum
> + * integer representable by a machine word for multiplicative hashing.
> + * Chuck Lever verified the effectiveness of this technique:
> + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
> + *
> + * These primes are chosen to be bit-sparse, that is operations on
> + * them can use shifts and additions instead of multiplications for
> + * machines where multiplications are slow.
> + */
> +#if BITS_PER_LONG == 32
> +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
> +#define GOLDEN_RATIO_PRIME 0x9e370001UL
> +#elif BITS_PER_LONG == 64
> +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
> +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
> +#else
> +#error Define GOLDEN_RATIO_PRIME for your wordsize.
> +#endif
> +
> +static inline unsigned long hash_long(unsigned long val, unsigned int bits)
> +{
> + unsigned long hash = val;
> +
> +#if BITS_PER_LONG == 64
> + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */
> + unsigned long n = hash;
> + n <<= 18;
> + hash -= n;
> + n <<= 33;
> + hash -= n;
> + n <<= 3;
> + hash += n;
> + n <<= 3;
> + hash -= n;
> + n <<= 4;
> + hash += n;
> + n <<= 2;
> + hash += n;
> +#else
> + /* On some cpus multiply is faster, on others gcc will do shifts */
> + hash *= GOLDEN_RATIO_PRIME;
> +#endif
> +
> + /* High bits are more random, so use them. */
> + return hash >> (BITS_PER_LONG - bits);
> +}
> +
> +static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
> +{
> + return hash_long((unsigned long)ptr, bits);
> +}
> +#endif /* _LINUX_HASH_H */
> # HG changeset patch
> # User yamahata@xxxxxxxxxxxxx
> # Node ID cb0aa2b2e180d76d09592ed32338f9cb4ac5b7a0
> # Parent c654d462c4481685fb2e803e41cb2beba56bee4b
> add TLB insert tracking so that the vTLB flush can cover a finer-grained
> virtual address range when a page is unmapped from a domain.
> This functionality is enabled with the compile-time option
> xen_ia64_tlb_track=y.
> PATCHNAME: tlb_track
>
> Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
>
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/Rules.mk
> --- a/xen/arch/ia64/Rules.mk Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/Rules.mk Mon Jul 24 21:35:16 2006 +0900
> @@ -39,6 +39,9 @@ ifeq ($(xen_ia64_dom0_virtual_physical),
> ifeq ($(xen_ia64_dom0_virtual_physical),y)
> CFLAGS += -DCONFIG_XEN_IA64_DOM0_VP
> endif
> +ifeq ($(xen_ia64_tlb_track),y)
> +CFLAGS += -DCONFIG_XEN_IA64_TLB_TRACK
> +endif
> ifeq ($(no_warns),y)
> CFLAGS += -Wa,--fatal-warnings -Werror -Wno-uninitialized
> endif
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/Makefile
> --- a/xen/arch/ia64/xen/Makefile Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/Makefile Mon Jul 24 21:35:16 2006 +0900
> @@ -27,3 +27,4 @@ obj-y += privop_stat.o
> obj-y += privop_stat.o
>
> obj-$(crash_debug) += gdbstub.o
> +obj-$(xen_ia64_tlb_track) += tlb_track.o
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/domain.c
> --- a/xen/arch/ia64/xen/domain.c Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/domain.c Mon Jul 24 21:35:16 2006 +0900
> @@ -60,6 +60,9 @@
> #include <asm/regionreg.h>
> #include <asm/dom_fw.h>
> #include <asm/privop_stat.h>
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +#include <asm/tlb_track.h>
> +#endif
>
> #ifndef CONFIG_XEN_IA64_DOM0_VP
> #define CONFIG_DOMAIN0_CONTIGUOUS
> @@ -351,6 +354,10 @@ int arch_domain_create(struct domain *d)
> if (is_idle_domain(d))
> return 0;
>
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> + if (tlb_track_create(d) < 0)
> + goto fail_nomem;
> +#endif
> d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
> if (d->shared_info == NULL)
> goto fail_nomem;
> @@ -389,6 +396,9 @@ void arch_domain_destroy(struct domain *
> if (d->shared_info != NULL)
> free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
>
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> + tlb_track_destroy(d);
> +#endif
> domain_flush_destroy (d);
>
> deallocate_rid_range(d);
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/faults.c
> --- a/xen/arch/ia64/xen/faults.c Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/faults.c Mon Jul 24 21:35:16 2006 +0900
> @@ -27,6 +27,7 @@
> #include <asm/debugger.h>
> #include <asm/fpswa.h>
> #include <asm/bundle.h>
> +#include <asm/p2m_entry.h>
> #include <asm/privop_stat.h>
> #include <asm/asm-xsi-offsets.h>
>
> @@ -202,8 +203,15 @@ void ia64_do_page_fault (unsigned long a
> fault = vcpu_translate(current,address,is_data,&pteval,&itir,&iha);
> if (fault == IA64_NO_FAULT || fault == IA64_USE_TLB) {
> struct p2m_entry entry;
> - pteval = translate_domain_pte(pteval, address, itir, &logps, &entry);
> - vcpu_itc_no_srlz(current,is_data?2:1,address,pteval,-1UL,logps);
> + unsigned long m_pteval;
> + m_pteval = translate_domain_pte(pteval, address, itir, &logps, &entry);
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
> + vcpu_itc_no_srlz(current, (is_data? 2: 1) | 4,
> + address, m_pteval, pteval, logps);
> +#else
> + vcpu_itc_no_srlz(current, (is_data? 2: 1) | 4,
> + address, m_pteval, pteval, logps, &entry);
> +#endif
> if ((fault == IA64_USE_TLB && !current->arch.dtlb.pte.p) ||
> p2m_entry_retry(&entry)) {
> /* dtlb has been purged in-between. This dtlb was
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/mm.c
> --- a/xen/arch/ia64/xen/mm.c Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/mm.c Mon Jul 24 21:35:16 2006 +0900
> @@ -170,13 +170,14 @@
> #include <asm/pgalloc.h>
> #include <asm/vhpt.h>
> #include <asm/vcpu.h>
> +#include <asm/p2m_entry.h>
> #include <linux/efi.h>
>
> #ifndef CONFIG_XEN_IA64_DOM0_VP
> #define CONFIG_DOMAIN0_CONTIGUOUS
> #else
> -static void domain_page_flush(struct domain* d, unsigned long mpaddr,
> - unsigned long old_mfn, unsigned long new_mfn);
> +static void domain_page_flush(struct domain* d,
> + volatile pte_t* ptep, pte_t old_pte);
> #endif
>
> static struct domain *dom_xen, *dom_io;
> @@ -718,6 +719,19 @@ void *domain_mpa_to_imva(struct domain *
> }
> #endif
>
> +static unsigned long
> +assign_flags_to_pteflags(unsigned long flags)
> +{
> + unsigned long pteflags =
> + (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> + if (flags & ASSIGN_tlb_track) {
> + pteflags |= _PAGE_TLB_TRACKING;
> + }
> +#endif
> + return pteflags;
> +}
> +
> /* Allocate a new page for domain and map it to the specified metaphysical
> address. */
> static struct page_info *
> @@ -811,7 +825,7 @@ assign_new_domain0_page(struct domain *d
> }
>
> /* map a physical address to the specified metaphysical addr */
> -// flags: currently only ASSIGN_readonly
> +// flags: ASSIGN_xxx
> // This is called by assign_domain_mmio_page().
> // So accessing to pte is racy.
> void
> @@ -823,13 +837,13 @@ __assign_domain_page(struct domain *d,
> pte_t old_pte;
> pte_t new_pte;
> pte_t ret_pte;
> - unsigned long arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> + unsigned long pteflags = assign_flags_to_pteflags(flags);
>
> pte = lookup_alloc_domain_pte(d, mpaddr);
>
> old_pte = __pte(0);
> new_pte = pfn_pte(physaddr >> PAGE_SHIFT,
> - __pgprot(__DIRTY_BITS | _PAGE_PL_2 | arflags));
> + __pgprot(__DIRTY_BITS | _PAGE_PL_2 | pteflags));
> ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
> if (pte_val(ret_pte) == pte_val(old_pte))
> smp_mb();
> @@ -945,7 +959,7 @@ assign_domain_mach_page(struct domain *d
> // caller must call set_gpfn_from_mfn() before call if necessary.
> // because set_gpfn_from_mfn() result must be visible before pte xchg
> // caller must use memory barrier. NOTE: xchg has acquire semantics.
> -// flags: currently only ASSIGN_readonly
> +// flags: ASSIGN_xxx
> static void
> assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
> unsigned long mfn, unsigned long flags)
> @@ -954,11 +968,11 @@ assign_domain_page_replace(struct domain
> volatile pte_t* pte;
> pte_t old_pte;
> pte_t npte;
> - unsigned long arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> + unsigned long pteflags = assign_flags_to_pteflags(flags);
> pte = lookup_alloc_domain_pte(d, mpaddr);
>
> // update pte
> - npte = pfn_pte(mfn, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | arflags));
> + npte = pfn_pte(mfn, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | pteflags));
> old_pte = ptep_xchg(mm, mpaddr, pte, npte);
> if (pte_mem(old_pte)) {
> unsigned long old_mfn = pte_pfn(old_pte);
> @@ -978,7 +992,7 @@ assign_domain_page_replace(struct domain
> set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
> }
>
> - domain_page_flush(d, mpaddr, old_mfn, mfn);
> + domain_page_flush(d, pte, old_pte);
>
> try_to_clear_PGC_allocate(d, old_page);
> put_page(old_page);
> @@ -997,29 +1011,29 @@ assign_domain_page_cmpxchg_rel(struct do
> struct mm_struct *mm = &d->arch.mm;
> volatile pte_t* pte;
> unsigned long old_mfn;
> - unsigned long old_arflags;
> + unsigned long old_pteflags;
> pte_t old_pte;
> unsigned long new_mfn;
> - unsigned long new_arflags;
> + unsigned long new_pteflags;
> pte_t new_pte;
> pte_t ret_pte;
>
> pte = lookup_alloc_domain_pte(d, mpaddr);
>
> again:
> - old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
> + old_pteflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
> old_mfn = page_to_mfn(old_page);
> - old_pte = pfn_pte(old_mfn, __pgprot(old_arflags));
> + old_pte = pfn_pte(old_mfn, __pgprot(old_pteflags));
> if (!pte_present(old_pte)) {
> - DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx\n",
> - __func__, pte_val(old_pte), old_arflags, old_mfn);
> + DPRINTK("%s: old_pte 0x%lx old_pteflags 0x%lx old_mfn 0x%lx\n",
> + __func__, pte_val(old_pte), old_pteflags, old_mfn);
> return -EINVAL;
> }
>
> - new_arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> + new_pteflags = assign_flags_to_pteflags(flags);
> new_mfn = page_to_mfn(new_page);
> new_pte = pfn_pte(new_mfn,
> - __pgprot(__DIRTY_BITS | _PAGE_PL_2 | new_arflags));
> + __pgprot(__DIRTY_BITS | _PAGE_PL_2 | new_pteflags));
>
> // update pte
> ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
> @@ -1028,10 +1042,10 @@ assign_domain_page_cmpxchg_rel(struct do
> goto again;
> }
>
> - DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx "
> + DPRINTK("%s: old_pte 0x%lx old_pteflags 0x%lx old_mfn 0x%lx "
> "ret_pte 0x%lx ret_mfn 0x%lx\n",
> __func__,
> - pte_val(old_pte), old_arflags, old_mfn,
> + pte_val(old_pte), old_pteflags, old_mfn,
> pte_val(ret_pte), pte_pfn(ret_pte));
> return -EINVAL;
> }
> @@ -1043,7 +1057,7 @@ assign_domain_page_cmpxchg_rel(struct do
>
> set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
>
> - domain_page_flush(d, mpaddr, old_mfn, new_mfn);
> + domain_page_flush(d, pte, old_pte);
> put_page(old_page);
> return 0;
> }
> @@ -1111,7 +1125,7 @@ zap_domain_page_one(struct domain *d, un
> set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
> }
>
> - domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
> + domain_page_flush(d, pte, old_pte);
>
> if (page_get_owner(page) != NULL) {
> try_to_clear_PGC_allocate(d, page);
> @@ -1199,8 +1213,12 @@ create_grant_host_mapping(unsigned long
> BUG_ON(ret == 0);
> BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
> get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
> - assign_domain_page_replace(d, gpaddr, mfn, (flags & GNTMAP_readonly)?
> - ASSIGN_readonly: ASSIGN_writable);
> + assign_domain_page_replace(d, gpaddr, mfn,
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> + ASSIGN_tlb_track |
> +#endif
> + ((flags & GNTMAP_readonly) ?
> + ASSIGN_readonly: ASSIGN_writable));
> return GNTST_okay;
> }
>
> @@ -1254,7 +1272,7 @@ destroy_grant_host_mapping(unsigned long
> }
> BUG_ON(pte_pfn(old_pte) != mfn);
>
> - domain_page_flush(d, gpaddr, mfn, INVALID_MFN);
> + domain_page_flush(d, pte, old_pte);
>
> page = mfn_to_page(mfn);
> BUG_ON(page_get_owner(page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
> @@ -1418,11 +1436,38 @@ guest_physmap_remove_page(struct domain
>
> //XXX sledgehammer.
> // flush finer range.
> -void
> -domain_page_flush(struct domain* d, unsigned long mpaddr,
> - unsigned long old_mfn, unsigned long new_mfn)
> -{
> +static void
> +domain_page_flush(struct domain* d, volatile pte_t* ptep, pte_t old_pte)
> +{
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
> domain_flush_vtlb_all();
> +#else
> + struct tlb_track_entry* entry;
> + switch (tlb_track_search_and_remove(d->arch.tlb_track,
> + ptep, old_pte, &entry)) {
> + case TLB_TRACK_NOT_TRACKED:
> + //DPRINTK("%s TLB_TRACK_NOT_TRACKED\n", __func__);
> + domain_flush_vtlb_all();
> + break;
> + case TLB_TRACK_NOT_FOUND:
> + // do nothing
> + //DPRINTK("%s TLB_TRACK_NOT_FOUND\n", __func__);
> + break;
> + case TLB_TRACK_FOUND:
> + //DPRINTK("%s TLB_TRACK_FOUND\n", __func__);
> + domain_flush_vltb_track_entry(d, entry);
> + tlb_track_free_entry(d->arch.tlb_track, entry);
> + break;
> + case TLB_TRACK_MANY:
> + DPRINTK("%s TLB_TRACK_MANY\n", __func__);
> + domain_flush_vtlb_all();
> + break;
> + case TLB_TRACK_AGAIN:
> + DPRINTK("%s TLB_TRACK_AGAIN\n", __func__);
> + BUG();
> + break;
> + }
> +#endif
> }
>
> int
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/vcpu.c
> --- a/xen/arch/ia64/xen/vcpu.c Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/vcpu.c Mon Jul 24 21:35:16 2006 +0900
> @@ -22,6 +22,7 @@
> #include <asm/vmx_phy_mode.h>
> #include <asm/bundle.h>
> #include <asm/privop_stat.h>
> +#include <asm/p2m_entry.h>
>
> /* FIXME: where these declarations should be there ? */
> extern void getreg(unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs);
> @@ -2003,7 +2004,11 @@ IA64FAULT vcpu_set_dtr(VCPU *vcpu, u64 s
> VCPU translation cache access routines
> **************************************************************************/
>
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
> void vcpu_itc_no_srlz(VCPU *vcpu, UINT64 IorD, UINT64 vaddr, UINT64 pte, UINT64 mp_pte, UINT64 logps)
> +#else
> +void vcpu_itc_no_srlz(VCPU *vcpu, UINT64 IorD, UINT64 vaddr, UINT64 pte, UINT64 mp_pte, UINT64 logps, struct p2m_entry* entry)
> +#endif
> {
> unsigned long psr;
> unsigned long ps = (vcpu->domain==dom0) ? logps : PAGE_SHIFT;
> @@ -2017,6 +2022,9 @@ void vcpu_itc_no_srlz(VCPU *vcpu, UINT64
>
> #ifdef CONFIG_XEN_IA64_DOM0_VP
> BUG_ON(logps > PAGE_SHIFT);
> +#endif
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> + vcpu_tlb_track_insert_or_dirty(vcpu, vaddr, entry);
> #endif
> psr = ia64_clear_ic();
> ia64_itc(IorD,vaddr,pte,ps); // FIXME: look for bigger mappings
> @@ -2035,7 +2043,7 @@ void vcpu_itc_no_srlz(VCPU *vcpu, UINT64
> // PAGE_SIZE mapping in the vhpt for now, else purging is complicated
> else vhpt_insert(vaddr,pte,PAGE_SHIFT<<2);
> #endif
> - if ((mp_pte == -1UL) || (IorD & 0x4)) // don't place in 1-entry TLB
> + if (IorD & 0x4) // don't place in 1-entry TLB
> return;
> if (IorD & 0x1) {
> vcpu_set_tr_entry(&PSCBX(vcpu,itlb),mp_pte,ps<<2,vaddr);
> @@ -2060,7 +2068,11 @@ again:
> pteval = translate_domain_pte(pte, ifa, itir, &logps, &entry);
> if (!pteval) return IA64_ILLOP_FAULT;
> if (swap_rr0) set_one_rr(0x0,PSCB(vcpu,rrs[0]));
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
> vcpu_itc_no_srlz(vcpu,2,ifa,pteval,pte,logps);
> +#else
> + vcpu_itc_no_srlz(vcpu,2,ifa,pteval,pte,logps,&entry);
> +#endif
> if (swap_rr0) set_metaphysical_rr0();
> if (p2m_entry_retry(&entry)) {
> vcpu_flush_tlb_vhpt_range(ifa, logps);
> @@ -2083,7 +2095,11 @@ again:
> pteval = translate_domain_pte(pte, ifa, itir, &logps, &entry);
> if (!pteval) return IA64_ILLOP_FAULT;
> if (swap_rr0) set_one_rr(0x0,PSCB(vcpu,rrs[0]));
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
> vcpu_itc_no_srlz(vcpu, 1,ifa,pteval,pte,logps);
> +#else
> + vcpu_itc_no_srlz(vcpu, 1,ifa,pteval,pte,logps,&entry);
> +#endif
> if (swap_rr0) set_metaphysical_rr0();
> if (p2m_entry_retry(&entry)) {
> vcpu_flush_tlb_vhpt_range(ifa, logps);
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/vhpt.c
> --- a/xen/arch/ia64/xen/vhpt.c Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/vhpt.c Mon Jul 24 21:35:16 2006 +0900
> @@ -227,6 +227,48 @@ void domain_flush_vtlb_range (struct dom
> ia64_global_tlb_purge(vadr,vadr+addr_range,PAGE_SHIFT);
> }
>
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +#include <asm/tlb_track.h>
> +void
> +domain_flush_vltb_track_entry(struct domain* d,
> + const struct tlb_track_entry* entry)
> +{
> + unsigned long old_rid;
> + struct vcpu* v;
> + int cpu;
> +
> + //tlb_track_entry_printf(entry);
> + vcpu_get_rr(current, 0, &old_rid);
> + vcpu_set_rr(current, 0, entry->rid);
> +
> + for_each_vcpu(d, v) {
> + if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
> + continue;
> + if (!vcpu_isset(v->vcpu_id, entry->vcpu_dirty_mask))
> + continue;
> +
> + /* Purge TC entries.
> + FIXME: clear only if match. */
> + vcpu_purge_tr_entry(&PSCBX(v, dtlb));
> + vcpu_purge_tr_entry(&PSCBX(v, itlb));
> + }
> + smp_mb();
> +
> + for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
> + //printk("%s:%d cpu %d\n", __func__, __LINE__, cpu);
> + /* Invalidate VHPT entries. */
> + cpu_flush_vhpt_range(cpu, entry->vaddr, PAGE_SIZE);
> + }
> + // ptc.ga has release semantics.
> +
> + /* ptc.ga */
> + ia64_global_tlb_purge(entry->vaddr, entry->vaddr + PAGE_SIZE,
> + PAGE_SHIFT);
> +
> + vcpu_set_rr(current, 0, old_rid);
> +}
> +#endif
> +
> static void flush_tlb_vhpt_all (struct domain *d)
> {
> /* First VHPT. */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/domain.h
> --- a/xen/include/asm-ia64/domain.h Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/asm-ia64/domain.h Mon Jul 24 21:35:16 2006 +0900
> @@ -12,28 +12,10 @@
> #include <xen/cpumask.h>
> #include <asm/fpswa.h>
>
> -struct p2m_entry {
> - volatile pte_t* pte;
> - pte_t used;
> -};
> -
> -static inline void
> -p2m_entry_set(struct p2m_entry* entry, volatile pte_t* pte, pte_t used)
> -{
> - entry->pte = pte;
> - entry->used = used;
> -}
> -
> -static inline int
> -p2m_entry_retry(struct p2m_entry* entry)
> -{
> - //XXX see lookup_domain_pte().
> - // NULL is set for invalid gpaddr for the time being.
> - if (entry->pte == NULL)
> - return 0;
> -
> - return (pte_val(*entry->pte) != pte_val(entry->used));
> -}
> +struct p2m_entry;
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +struct tlb_track;
> +#endif
>
> extern void domain_relinquish_resources(struct domain *);
>
> @@ -118,6 +100,10 @@ struct arch_domain {
> void *fpswa_inf;
>
> struct last_vcpu last_vcpu[NR_CPUS];
> +
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> + struct tlb_track* tlb_track;
> +#endif
> };
> #define INT_ENABLE_OFFSET(v) \
> (sizeof(vcpu_info_t) * (v)->vcpu_id + \
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/tlbflush.h
> --- a/xen/include/asm-ia64/tlbflush.h Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/asm-ia64/tlbflush.h Mon Jul 24 21:35:16 2006 +0900
> @@ -22,6 +22,13 @@ void domain_flush_vtlb_all (void);
> /* Global range-flush of vTLB. */
> void domain_flush_vtlb_range (struct domain *d, u64 vadr, u64 addr_range);
>
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +struct tlb_track_entry;
> +/* Global entry-flush of vTLB */
> +void domain_flush_vltb_track_entry(struct domain* d,
> + const struct tlb_track_entry* entry);
> +#endif
> +
> /* Final vTLB flush on every dirty cpus. */
> void domain_flush_destroy (struct domain *d);
>
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/vcpu.h
> --- a/xen/include/asm-ia64/vcpu.h Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/asm-ia64/vcpu.h Mon Jul 24 21:35:16 2006 +0900
> @@ -158,7 +158,12 @@ extern void vcpu_set_next_timer(VCPU *vc
> extern void vcpu_set_next_timer(VCPU *vcpu);
> extern BOOLEAN vcpu_timer_expired(VCPU *vcpu);
> extern UINT64 vcpu_deliverable_interrupts(VCPU *vcpu);
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
> extern void vcpu_itc_no_srlz(VCPU *vcpu, UINT64, UINT64, UINT64, UINT64, UINT64);
> +#else
> +struct p2m_entry;
> +extern void vcpu_itc_no_srlz(VCPU *vcpu, UINT64, UINT64, UINT64, UINT64, UINT64, struct p2m_entry*);
> +#endif
> extern UINT64 vcpu_get_tmp(VCPU *, UINT64);
> extern void vcpu_set_tmp(VCPU *, UINT64, UINT64);
>
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/public/arch-ia64.h
> --- a/xen/include/public/arch-ia64.h Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/public/arch-ia64.h Mon Jul 24 21:35:16 2006 +0900
> @@ -357,8 +357,14 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_guest_conte
> // address space.
> // flags for page assignement to pseudo physical address space
> #define _ASSIGN_readonly 0
> +#define _ASSIGN_tlb_track 1
> +
> #define ASSIGN_readonly (1UL << _ASSIGN_readonly)
> #define ASSIGN_writable (0UL << _ASSIGN_readonly) // dummy flag
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +# define ASSIGN_tlb_track (1UL << _ASSIGN_tlb_track)
> +#endif
> +
>
> /* This structure has the same layout of struct ia64_boot_param, defined in
> <asm/system.h>. It is redefined here to ease use. */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/tlb_track.c
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/arch/ia64/xen/tlb_track.c Mon Jul 24 21:35:16 2006 +0900
> @@ -0,0 +1,558 @@
> +/******************************************************************************
> + * tlb_track.h
> + *
> + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
> + * VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> + *
> + */
> +
> +#include <asm/tlb_track.h>
> +#include <asm/p2m_entry.h>
> +#include <asm/vmx_mm_def.h> // for IA64_RR_SHIFT
> +#include <asm/vcpu.h> // for PSCB()
> +
> +#define CONFIG_TLB_TRACK_DEBUG
> +#ifdef CONFIG_TLB_TRACK_DEBUG
> +# define tlb_track_printd(fmt, ...) \
> + printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__)
> +#else
> +# define tlb_track_printd(fmt, ...) do { } while (0)
> +#endif
> +
> +#define CONFIG_TLB_TRACK_STAT_KEY_HANDLER
> +#ifdef CONFIG_TLB_TRACK_STAT_KEY_HANDLER
> +#include <asm/regs.h>
> +#include <xen/keyhandler.h>
> +
> +static void
> +dump_tlb_track_stat(unsigned char key)
> +{
> + tlb_track_stat_printf(&dom0->arch.tlb_track->stat);
> +}
> +#endif
> +
> +static int
> +tlb_track_allocate_entries(struct tlb_track* tlb_track)
> +{
> + struct page_info* entry_page;
> + struct tlb_track_entry* track_entries;
> + unsigned int allocated;
> + unsigned long i;
> +
> + BUG_ON(tlb_track->num_free > 0);
> + if (tlb_track->num_entries >= tlb_track->limit) {
> + DPRINTK("%s: num_entries %d limit %d\n",
> + __func__, tlb_track->num_entries, tlb_track->limit);
> + return -ENOMEM;
> + }
> + entry_page = alloc_domheap_page(NULL);
> + if (entry_page == NULL) {
> + DPRINTK("%s: domheap page failed. num_entries %d limit %d\n",
> + __func__, tlb_track->num_entries, tlb_track->limit);
> + return -ENOMEM;
> + }
> +
> + list_add(&entry_page->list, &tlb_track->page_list);
> + track_entries = (struct tlb_track_entry*)page_to_virt(entry_page);
> + allocated = PAGE_SIZE / sizeof(track_entries[0]);
> + tlb_track->num_entries += allocated;
> + tlb_track->num_free += allocated;
> + for (i = 0; i < allocated; i++) {
> + list_add(&track_entries[i].list, &tlb_track->free_list);
> + //tlb_track_printd("track_entries[%ld] 0x%p\n", i,
> &track_entries[i]);
> + }
> + tlb_track_printd("allocated %d num_entries %d num_free %d\n",
> + allocated, tlb_track->num_entries, tlb_track->num_free);
> + return 0;
> +}
> +
> +
> +int
> +tlb_track_create(struct domain* d)
> +{
> + struct tlb_track* tlb_track = NULL;
> + struct page_info* hash_page = NULL;
> + unsigned int hash_size;
> + unsigned int hash_shift;
> + unsigned int i;
> +
> + tlb_track = xmalloc(struct tlb_track);
> + if (tlb_track == NULL) {
> + goto out;
> + }
> + hash_page = alloc_domheap_page(NULL);
> + if (hash_page == NULL) {
> + goto out;
> + }
> +
> + spin_lock_init(&tlb_track->free_list_lock);
> + INIT_LIST_HEAD(&tlb_track->free_list);
> + tlb_track->limit = TLB_TRACK_LIMIT_ENTRIES;
> + tlb_track->num_entries = 0;
> + tlb_track->num_free = 0;
> + INIT_LIST_HEAD(&tlb_track->page_list);
> + if (tlb_track_allocate_entries(tlb_track) < 0) {
> + goto out;
> + }
> +
> + spin_lock_init(&tlb_track->hash_lock);
> + //XXX hash size optimization
> + hash_size = PAGE_SIZE / sizeof(tlb_track->hash[0]);
> + for (hash_shift = 0; (1 << (hash_shift + 1)) < hash_size; hash_shift++)
> + /* nothing */;
> + tlb_track->hash_size = (1 << hash_shift);
> + tlb_track->hash_shift = hash_shift;
> + tlb_track->hash_mask = (1 << hash_shift) - 1;
> + tlb_track->hash = page_to_virt(hash_page);
> + for (i = 0; i < tlb_track->hash_size; i++) {
> + INIT_LIST_HEAD(&tlb_track->hash[i]);
> + }
> +
> + memset(&tlb_track->stat, 0, sizeof(tlb_track->stat));
> +
> + smp_mb(); // make initialization visible before use.
> + d->arch.tlb_track = tlb_track;
> + printk("%s:%d hash 0x%p hash_size %d \n",
> + __func__, __LINE__, tlb_track->hash, tlb_track->hash_size);
> +
> +#ifdef CONFIG_TLB_TRACK_STAT_KEY_HANDLER
> + register_keyhandler(
> + 's', dump_tlb_track_stat, "dump dom0 tlb track stats");
> +#endif
> + return 0;
> +
> +out:
> + if (hash_page != NULL) {
> + free_domheap_page(hash_page);
> + }
> + if (tlb_track != NULL) {
> + xfree(tlb_track);
> + }
> + return -ENOMEM;
> +}
> +
> +void
> +tlb_track_destroy(struct domain* d)
> +{
> + struct tlb_track* tlb_track = d->arch.tlb_track;
> + struct page_info* page;
> + struct page_info* next;
> +
> + spin_lock(&tlb_track->free_list_lock);
> + BUG_ON(tlb_track->num_free != tlb_track->num_entries);
> +
> + list_for_each_entry_safe(page, next, &tlb_track->page_list, list) {
> + list_del(&page->list);
> + free_domheap_page(page);
> + }
> +
> + free_domheap_page(virt_to_page(tlb_track->hash));
> + xfree(tlb_track);
> + //d->tlb_track = NULL;
> +}
> +
> +static struct tlb_track_entry*
> +tlb_track_get_entry(struct tlb_track* tlb_track)
> +{
> + struct tlb_track_entry* entry = NULL;
> + spin_lock(&tlb_track->free_list_lock);
> + if (tlb_track->num_free == 0) {
> + (void)tlb_track_allocate_entries(tlb_track);
> + }
> + if (tlb_track->num_free > 0) {
> + BUG_ON(list_empty(&tlb_track->free_list));
> + entry = list_entry(tlb_track->free_list.next,
> + struct tlb_track_entry, list);
> + tlb_track->num_free--;
> + list_del(&entry->list);
> + }
> + spin_unlock(&tlb_track->free_list_lock);
> + return entry;
> +}
> +
> +void
> +tlb_track_free_entry(struct tlb_track* tlb_track,
> + struct tlb_track_entry* entry)
> +{
> + spin_lock(&tlb_track->free_list_lock);
> + list_add(&entry->list, &tlb_track->free_list);
> + tlb_track->num_free++;
> + spin_unlock(&tlb_track->free_list_lock);
> +}
> +
> +
> +#include <linux/hash.h>
> +// XXX hash function.
> +static struct list_head*
> +tlb_track_hash_head(struct tlb_track* tlb_track, volatile pte_t* ptep)
> +{
> + unsigned long hash = hash_long((unsigned long)ptep, tlb_track->hash_shift);
> + BUG_ON(hash >= tlb_track->hash_size);
> + BUG_ON((hash & tlb_track->hash_mask) != hash);
> + return &tlb_track->hash[hash];
> +}
> +
> +static int
> +tlb_track_pte_zapped(pte_t old_pte, pte_t ret_pte)
> +{
> + if (pte_pfn(old_pte) != pte_pfn(ret_pte) ||
> + (pte_val(old_pte) & ~(_PFN_MASK | _PAGE_TLB_TRACK_MASK)) !=
> + (pte_val(ret_pte) & ~(_PFN_MASK | _PAGE_TLB_TRACK_MASK))) {
> + // Other thread zapped the p2m entry.
> + return 1;
> + }
> + return 0;
> +}
> +
> +static TLB_TRACK_RET_T
> +tlb_track_insert_or_dirty(struct tlb_track* tlb_track, struct mm_struct* mm,
> + volatile pte_t* ptep, pte_t old_pte,
> + unsigned long vaddr, unsigned long rid)
> +{
> + unsigned long mfn = pte_pfn(old_pte);
> + struct list_head* head = tlb_track_hash_head(tlb_track, ptep);
> + struct tlb_track_entry* entry;
> + struct tlb_track_entry* new_entry = NULL;
> + unsigned long bit_to_be_set = _PAGE_TLB_INSERTED;
> + pte_t new_pte;
> + pte_t ret_pte;
> +
> + struct vcpu* v = current;
> + TLB_TRACK_RET_T ret = TLB_TRACK_NOT_FOUND;
> +
> + tlb_track->stat.iod++;
> + if (!pte_tlb_tracking(old_pte)) {
> + tlb_track->stat.iod_not_tracked++;
> + return TLB_TRACK_NOT_TRACKED;
> + }
> + if (pte_tlb_inserted_many(old_pte)) {
> + tlb_track->stat.iod_tracked_many++;
> + return TLB_TRACK_MANY;
> + }
> +
> + // vaddr must be normalized so that it is in rr0 and page aligned.
> + BUG_ON((vaddr >> IA64_RR_SHIFT) != 0);
> + BUG_ON((vaddr & ~PAGE_MASK) != 0);
> +#if 0
> + tlb_track_printd("\n"
> + "\tmfn 0x%016lx\n"
> + "\told_pte 0x%016lx ptep 0x%p\n"
> + "\tptep_val 0x%016lx vaddr 0x%016lx rid %ld\n"
> + "\ttlb_track 0x%p head 0x%p\n",
> + mfn,
> + pte_val(old_pte), ptep, pte_val(*ptep),
> + vaddr, rid,
> + tlb_track, head);
> +#endif
> +
> + again:
> + // zapping side may zap the p2m entry and then remove tlb track entry
> + // non-atomically. We may see the stale tlb track entry here.
> + // p2m_entry_retry() handles such a case.
> + // Or other thread may zap the p2m entry and remove tlb track entry
> + // and inserted new tlb track entry.
> + spin_lock(&tlb_track->hash_lock);
> + list_for_each_entry(entry, head, list) {
> + if (entry->ptep != ptep) {
> + continue;
> + }
> +
> + if (pte_pfn(entry->pte_val) == mfn) {
> + //tlb_track_entry_printf(entry);
> + if (entry->vaddr == vaddr && entry->rid == rid) {
> + //tlb_track_printd("TLB_TRACK_FOUND\n");
> + ret = TLB_TRACK_FOUND;
> + tlb_track->stat.iod_found++;
> +#ifdef CONFIG_TLB_TRACK_CNT
> + entry->cnt++;
> + if (entry->cnt > TLB_TRACK_CNT_FORCE_MANY) {
> + // heuristics:
> + // If a page is used to transfer data by dev channel,
> + // it would be unmapped with small amount access
> + // (once or twice tlb insert) after real device
> + // I/O completion. It would be short period.
> + // However this page seems to be accessed many times.
> + // We guess that this page is used I/O ring
> + // so that tracking this entry might be useless.
> + //tlb_track_entry_printf(entry);
> + //tlb_track_printd("cnt = %ld\n", entry->cnt);
> + tlb_track->stat.iod_force_many++;
> + goto force_many;
> + }
> +#endif
> + goto found;
> + } else {
> +#ifdef CONFIG_TLB_TRACK_CNT
> + force_many:
> +#endif
> + if (!pte_tlb_inserted(old_pte)) {
> + printk("%s:%d racy update\n", __func__, __LINE__);
> + old_pte = __pte(pte_val(old_pte) | _PAGE_TLB_INSERTED);
> + }
> + new_pte = __pte(pte_val(old_pte) | _PAGE_TLB_INSERTED_MANY);
> + ret_pte = ptep_cmpxchg_rel(mm, vaddr, ptep, old_pte, new_pte);
> + if (pte_val(ret_pte) != pte_val(old_pte)) {
> + //tlb_track_printd("TLB_TRACK_AGAIN\n");
> + ret = TLB_TRACK_AGAIN;
> + tlb_track->stat.iod_again++;
> + } else {
> + //tlb_track_printd("TLB_TRACK_MANY del entry 0x%p\n",
> entry);
> + ret = TLB_TRACK_MANY;
> + list_del(&entry->list);
> + //tlb_track_entry_printf(entry);
> + tlb_track->stat.iod_tracked_many_del++;
> + }
> + goto out;
> + }
> + }
> +
> + // Other thread changed the p2m entry and removed and inserted new
> + // tlb tracn entry after we get old_pte, but before we get
> + // spinlock.
> + //tlb_track_printd("TLB_TRACK_AGAIN\n");
> + ret = TLB_TRACK_AGAIN;
> + tlb_track->stat.iod_again++;
> + goto out;
> + }
> +
> + entry = NULL; // prevent freeing entry.
> + if (pte_tlb_inserted(old_pte)) {
> + // Other thread else removed the tlb_track_entry after we got old_pte
> + // before we got spin lock.
> + ret = TLB_TRACK_AGAIN;
> + tlb_track->stat.iod_again++;
> + goto out;
> + }
> + if (new_entry == NULL && bit_to_be_set == _PAGE_TLB_INSERTED) {
> + spin_unlock(&tlb_track->hash_lock);
> + new_entry = tlb_track_get_entry(tlb_track);
> + if (new_entry == NULL) {
> + tlb_track_printd("get_entry failed\n");
> + // entry can't be allocated.
> + // fall down into full flush mode.
> + bit_to_be_set |= _PAGE_TLB_INSERTED_MANY;
> + tlb_track->stat.iod_new_failed++;
> + }
> + //tlb_track_printd("new_entry 0x%p\n", new_entry);
> + tlb_track->stat.iod_new_entry++;
> + goto again;
> + }
> +
> + BUG_ON(pte_tlb_inserted_many(old_pte));
> + new_pte = __pte(pte_val(old_pte) | bit_to_be_set);
> + ret_pte = ptep_cmpxchg_rel(mm, vaddr, ptep, old_pte, new_pte);
> + if (pte_val(old_pte) != pte_val(ret_pte)) {
> + if (tlb_track_pte_zapped(old_pte, ret_pte)) {
> + //tlb_track_printd("zapped TLB_TRACK_AGAIN\n");
> + ret = TLB_TRACK_AGAIN;
> + tlb_track->stat.iod_again++;
> + goto out;
> + }
> +
> + // Other thread set _PAGE_TLB_INSERTED and/or _PAGE_TLB_INSERTED_MANY
> + if (pte_tlb_inserted_many(ret_pte)) {
> + // Other thread already set _PAGE_TLB_INSERTED_MANY and
> + // removed the entry.
> + //tlb_track_printd("iserted TLB_TRACK_MANY\n");
> + BUG_ON(!pte_tlb_inserted(ret_pte));
> + ret = TLB_TRACK_MANY;
> + tlb_track->stat.iod_new_many++;
> + goto out;
> + }
> + BUG_ON(pte_tlb_inserted(ret_pte));
> + BUG();
> + }
> + if (new_entry) {
> + //tlb_track_printd("iserting new_entry 0x%p\n", new_entry);
> + entry = new_entry;
> + new_entry = NULL;
> +
> + entry->ptep = ptep;
> + entry->pte_val = old_pte;
> + entry->vaddr = vaddr;
> + entry->rid = rid;
> + cpus_clear(entry->pcpu_dirty_mask);
> + vcpus_clear(entry->vcpu_dirty_mask);
> + list_add(&entry->list, head);
> +
> +#ifdef CONFIG_TLB_TRACK_CNT
> + entry->cnt = 0;
> +#endif
> + tlb_track->stat.iod_insert++;
> + //tlb_track_entry_printf(entry);
> + } else {
> + goto out;
> + }
> +
> + found:
> + BUG_ON(v->processor >= NR_CPUS);
> + cpu_set(v->processor, entry->pcpu_dirty_mask);
> + BUG_ON(v->vcpu_id >= NR_CPUS);
> + vcpu_set(v->vcpu_id, entry->vcpu_dirty_mask);
> + tlb_track->stat.iod_dirtied++;
> +
> + out:
> + spin_unlock(&tlb_track->hash_lock);
> + if (ret == TLB_TRACK_MANY && entry != NULL) {
> + tlb_track_free_entry(tlb_track, entry);
> + }
> + if (new_entry != NULL) {
> + tlb_track_free_entry(tlb_track, new_entry);
> + }
> + return ret;
> +}
> +
> +void
> +vcpu_tlb_track_insert_or_dirty(struct vcpu *vcpu, unsigned long vaddr,
> + struct p2m_entry* entry)
> +{
> + unsigned long vrn = vaddr >> IA64_RR_SHIFT;
> + unsigned long rid = PSCB(vcpu, rrs[vrn]);
> + TLB_TRACK_RET_T ret;
> +
> + vaddr = (vaddr << 3) >> 3;// mask rid bit
> + vaddr &= PAGE_MASK;
> + ret = tlb_track_insert_or_dirty(vcpu->domain->arch.tlb_track,
> + &vcpu->domain->arch.mm,
> + entry->ptep, entry->used,
> + vaddr, rid);
> + if (ret == TLB_TRACK_AGAIN) {
> + p2m_entry_set_retry(entry);
> + }
> +}
> +
> +TLB_TRACK_RET_T
> +tlb_track_search_and_remove(struct tlb_track* tlb_track,
> + volatile pte_t* ptep, pte_t old_pte,
> + struct tlb_track_entry** entryp)
> +{
> + unsigned long mfn = pte_pfn(old_pte);
> + struct list_head* head = tlb_track_hash_head(tlb_track, ptep);
> + struct tlb_track_entry* entry;
> +
> + tlb_track->stat.sar++;
> + if (!pte_tlb_tracking(old_pte)) {
> + tlb_track->stat.sar_not_tracked++;
> + return TLB_TRACK_NOT_TRACKED;
> + }
> + if (!pte_tlb_inserted(old_pte)) {
> + BUG_ON(pte_tlb_inserted_many(old_pte));
> + tlb_track->stat.sar_not_found++;
> + return TLB_TRACK_NOT_FOUND;
> + }
> + if (pte_tlb_inserted_many(old_pte)) {
> + BUG_ON(!pte_tlb_inserted(old_pte));
> + tlb_track->stat.sar_many++;
> + return TLB_TRACK_MANY;
> + }
> +
> + spin_lock(&tlb_track->hash_lock);
> + list_for_each_entry(entry, head, list) {
> + if (entry->ptep != ptep) {
> + continue;
> + }
> + if (pte_pfn(entry->pte_val) == mfn) {
> + list_del(&entry->list);
> + tlb_track->stat.sar_found++;
> + spin_unlock(&tlb_track->hash_lock);
> + *entryp = entry;
> + //tlb_track_entry_printf(entry);
> +#ifdef CONFIG_TLB_TRACK_CNT
> + //tlb_track_printd("cnt = %ld\n", entry->cnt);
> +#endif
> + return TLB_TRACK_FOUND;
> + }
> + BUG();
> + }
> + BUG();
> + spin_unlock(&tlb_track->hash_lock);
> + return TLB_TRACK_NOT_TRACKED;
> +}
> +
> +void
> +tlb_track_stat_printf(const struct tlb_track_stat* stat)
> +{
> + printk("iod %ld\n"
> + "iod_again %ld\n"
> + "iod_not_tracked %ld\n"
> + "iod_force_many %ld\n"
> + "iod_tracked_many %ld\n"
> + "iod_tracked_many_del %ld\n"
> + "iod_found %ld\n"
> + "iod_new_entry %ld\n"
> + "iod_new_failed %ld\n"
> + "iod_new_many %ld\n"
> + "iod_insert %ld\n"
> + "iod_dirtied %ld\n"
> + "sar %ld\n"
> + "sar_not_tracked %ld\n"
> + "sar_not_found %ld\n"
> + "sar_found %ld\n"
> + "sar_many %ld\n",
> + stat->iod,
> + stat->iod_again,
> + stat->iod_not_tracked,
> + stat->iod_force_many,
> + stat->iod_tracked_many,
> + stat->iod_tracked_many_del,
> + stat->iod_found,
> + stat->iod_new_entry,
> + stat->iod_new_failed,
> + stat->iod_new_many,
> + stat->iod_insert,
> + stat->iod_dirtied,
> + stat->sar,
> + stat->sar_not_tracked,
> + stat->sar_not_found,
> + stat->sar_found,
> + stat->sar_many);
> +}
> +
> +// for debug
> +void
> +__tlb_track_entry_printf(const char* func, int line,
> + const struct tlb_track_entry* entry)
> +{
> + char pcpumask_buf[NR_CPUS + 1];
> + char vcpumask_buf[MAX_VIRT_CPUS + 1];
> + cpumask_scnprintf(pcpumask_buf, sizeof(pcpumask_buf),
> + entry->pcpu_dirty_mask);
> + vcpumask_scnprintf(vcpumask_buf, sizeof(vcpumask_buf),
> + entry->vcpu_dirty_mask);
> + printk("%s:%d\n"
> + "\tmfn 0x%016lx\n"
> + "\told_pte 0x%016lx ptep 0x%p\n"
> + "\tpte_val 0x%016lx vaddr 0x%016lx rid %ld\n"
> + "\tpcpu_dirty_mask %s vcpu_dirty_mask %s\n"
> + "\tentry 0x%p\n",
> + func, line,
> + pte_pfn(entry->pte_val),
> + pte_val(entry->pte_val), entry->ptep, pte_val(*entry->ptep),
> + entry->vaddr, entry->rid,
> + pcpumask_buf, vcpumask_buf,
> + entry);
> +}
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/p2m_entry.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-ia64/p2m_entry.h Mon Jul 24 21:35:16 2006 +0900
> @@ -0,0 +1,76 @@
> +/******************************************************************************
> + * p2m_entry.h
> + *
> + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
> + * VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> + *
> + */
> +
> +#ifndef __ASM_P2M_ENTRY_H__
> +#define __ASM_P2M_ENTRY_H__
> +
> +#include <asm/tlb_track.h>
> +
> +struct p2m_entry {
> +#define P2M_PTE_ALWAYS_RETRY ((volatile pte_t*) -1)
> + volatile pte_t* ptep;
> + pte_t used;
> +};
> +
> +static inline void
> +p2m_entry_set(struct p2m_entry* entry, volatile pte_t* ptep, pte_t used)
> +{
> + entry->ptep = ptep;
> + entry->used = used;
> +}
> +
> +static inline void
> +p2m_entry_set_retry(struct p2m_entry* entry)
> +{
> + entry->ptep = P2M_PTE_ALWAYS_RETRY;
> +}
> +
> +static inline int
> +p2m_entry_retry(struct p2m_entry* entry)
> +{
> + //XXX see lookup_domain_pte().
> + // NULL is set for invalid gpaddr for the time being.
> + if (entry->ptep == NULL)
> + return 0;
> +
> + if (entry->ptep == P2M_PTE_ALWAYS_RETRY)
> + return 1;
> +
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> + return ((pte_val(*entry->ptep) & ~_PAGE_TLB_TRACK_MASK) !=
> + (pte_val(entry->used) & ~_PAGE_TLB_TRACK_MASK));
> +#else
> + return (pte_val(*entry->ptep) != pte_val(entry->used));
> +#endif
> +}
> +
> +#endif // __ASM_P2M_ENTRY_H__
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/tlb_track.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-ia64/tlb_track.h Mon Jul 24 21:35:16 2006 +0900
> @@ -0,0 +1,201 @@
> +/******************************************************************************
> + * tlb_track.c
> + *
> + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
> + * VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> + *
> + */
> +
> +#ifndef __TLB_TRACK_H__
> +#define __TLB_TRACK_H__
> +
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +
> +#include <asm/domain.h>
> +#include <xen/list.h>
> +
> +#define _PAGE_TLB_TRACKING_BIT 53
> +#define _PAGE_TLB_INSERTED_BIT 54
> +#define _PAGE_TLB_INSERTED_MANY_BIT 55
> +
> +#define _PAGE_TLB_TRACKING (1UL << _PAGE_TLB_TRACKING_BIT)
> +#define _PAGE_TLB_INSERTED (1UL << _PAGE_TLB_INSERTED_BIT)
> +#define _PAGE_TLB_INSERTED_MANY (1UL << _PAGE_TLB_INSERTED_MANY_BIT)
> +#define _PAGE_TLB_TRACK_MASK (_PAGE_TLB_TRACKING | _PAGE_TLB_INSERTED | _PAGE_TLB_INSERTED_MANY)
> +
> +#define pte_tlb_tracking(pte) \
> + ((pte_val(pte) & _PAGE_TLB_TRACKING) != 0)
> +#define pte_tlb_inserted(pte) \
> + ((pte_val(pte) & _PAGE_TLB_INSERTED) != 0)
> +#define pte_tlb_inserted_many(pte) \
> + ((pte_val(pte) & _PAGE_TLB_INSERTED_MANY) != 0)
> +
> +
> +// vcpu mask
> +// stolen from cpumask.h
> +typedef struct { DECLARE_BITMAP(bits, MAX_VIRT_CPUS); } vcpumask_t;
> +
> +#define vcpu_set(vcpu, dst) __vcpu_set((vcpu), &(dst))
> +static inline void __vcpu_set(int vcpu, volatile vcpumask_t *dstp)
> +{
> + set_bit(vcpu, dstp->bits);
> +}
> +#define vcpus_clear(dst) __vcpus_clear(&(dst), MAX_VIRT_CPUS)
> +static inline void __vcpus_clear(vcpumask_t *dstp, int nbits)
> +{
> + bitmap_zero(dstp->bits, nbits);
> +}
> +/* No static inline type checking - see Subtlety (1) above. */
> +#define vcpu_isset(vcpu, vcpumask) test_bit((vcpu), (vcpumask).bits)
> +
> +#define vcpumask_scnprintf(buf, len, src) \
> + __vcpumask_scnprintf((buf), (len), &(src), MAX_VIRT_CPUS)
> +static inline int __vcpumask_scnprintf(char *buf, int len,
> + const vcpumask_t *srcp, int nbits)
> +{
> + return bitmap_scnprintf(buf, len, srcp->bits, nbits);
> +}
> +
> +
> +// TODO: compact this structure.
> +struct tlb_track_entry {
> + struct list_head list;
> +
> +
> + volatile pte_t* ptep; // corresponding p2m entry
> +
> + //XXX should we use TR_ENTRY?
> + pte_t pte_val; // mfn and other flags
> + // pte_val.p = 1:
> + // tlb entry is inserted.
> + // pte_val.p = 0:
> + // once tlb entry is inserted, so
> + // this entry is created. But tlb
> + // purge is isseued, so this
> + // virtual address need not to be
> + // purged.
> + unsigned long vaddr; // virtual address
> + unsigned long rid; // rid
> +
> + cpumask_t pcpu_dirty_mask;
> + vcpumask_t vcpu_dirty_mask;
> + // tlbflush_timestamp;
> +
> +#define CONFIG_TLB_TRACK_CNT
> +#ifdef CONFIG_TLB_TRACK_CNT
> +#define TLB_TRACK_CNT_FORCE_MANY 256 //XXX how many?
> + unsigned long cnt;
> +#endif
> +};
> +
> +struct tlb_track_stat {
> + // insert or dirty
> + unsigned long iod;
> + unsigned long iod_again;
> + unsigned long iod_not_tracked;
> + unsigned long iod_force_many;
> + unsigned long iod_tracked_many;
> + unsigned long iod_tracked_many_del;
> + unsigned long iod_found;
> + unsigned long iod_new_entry;
> + unsigned long iod_new_failed;
> + unsigned long iod_new_many;
> + unsigned long iod_insert;
> + unsigned long iod_dirtied;
> +
> + // search and remove
> + unsigned long sar;
> + unsigned long sar_not_tracked;
> + unsigned long sar_not_found;
> + unsigned long sar_found;
> + unsigned long sar_many;
> +};
> +void tlb_track_stat_printf(const struct tlb_track_stat* stat);
> +
> +struct tlb_track {
> +
> +// see __gnttab_map_grant_ref()
> +// A domain can map granted-page up to MAPTRACK_MAX_ENTRIES pages.
> +#define TLB_TRACK_LIMIT_ENTRIES \
> + (MAPTRACK_MAX_ENTRIES * (PAGE_SIZE / sizeof(struct tlb_track)))
> +
> + spinlock_t free_list_lock;
> + struct list_head free_list;
> + unsigned int limit;
> + unsigned int num_entries;
> + unsigned int num_free;
> + struct list_head page_list;
> +
> + // XXX hash table size
> + spinlock_t hash_lock;
> + unsigned int hash_size;
> + unsigned int hash_shift;
> + unsigned int hash_mask;
> + struct list_head* hash;
> +
> + struct tlb_track_stat stat;
> +};
> +
> +int tlb_track_create(struct domain* d);
> +void tlb_track_destroy(struct domain* d);
> +
> +void tlb_track_free_entry(struct tlb_track* tlb_track,
> + struct tlb_track_entry* entry);
> +
> +struct p2m_entry;
> +void
> +vcpu_tlb_track_insert_or_dirty(struct vcpu *vcpu, unsigned long vaddr,
> + struct p2m_entry* entry);
> +
> +// return value
> +// NULL if this entry is used
> +// entry if this entry isn't used
> +enum TLB_TRACK_RET {
> + TLB_TRACK_NOT_TRACKED,
> + TLB_TRACK_NOT_FOUND,
> + TLB_TRACK_FOUND,
> + TLB_TRACK_MANY,
> + TLB_TRACK_AGAIN,
> +};
> +typedef enum TLB_TRACK_RET TLB_TRACK_RET_T;
> +
> +TLB_TRACK_RET_T
> +tlb_track_search_and_remove(struct tlb_track* tlb_track,
> + volatile pte_t* ptep, pte_t old_pte,
> + struct tlb_track_entry** entryp);
> +
> +void
> +__tlb_track_entry_printf(const char* func, int line,
> + const struct tlb_track_entry* entry);
> +#define tlb_track_entry_printf(entry) \
> + __tlb_track_entry_printf(__func__, __LINE__, (entry))
> +#else
> +//define nop
> +
> +#endif // CONFIG_XEN_IA64_TLB_TRACK
> +
> +#endif // __TLB_TRACK_H__
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> # HG changeset patch
> # User yamahata@xxxxxxxxxxxxx
> # Node ID a56d48066373c9fe317e986580c08394fe89fc7e
> # Parent cb0aa2b2e180d76d09592ed32338f9cb4ac5b7a0
> implement the per-vcpu VHPT option: allocate a VHPT per vcpu.
> added a compile-time option, xen_ia64_pervcpu_vhpt=y, to enable it.
> added a Xen boot-time option, pervcpu_vhpt=0, to disable it.
> This patch depends on the TLB tracking patch.
> PATCHNAME: pervcpu_vhpt
>
> Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
>
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/Rules.mk
> --- a/xen/arch/ia64/Rules.mk Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/Rules.mk Mon Jul 24 21:37:15 2006 +0900
> @@ -42,6 +42,9 @@ ifeq ($(xen_ia64_tlb_track),y)
> ifeq ($(xen_ia64_tlb_track),y)
> CFLAGS += -DCONFIG_XEN_IA64_TLB_TRACK
> endif
> +ifeq ($(xen_ia64_pervcpu_vhpt),y)
> +CFLAGS += -DCONFIG_XEN_IA64_PERVCPU_VHPT
> +endif
> ifeq ($(no_warns),y)
> CFLAGS += -Wa,--fatal-warnings -Werror -Wno-uninitialized
> endif
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/xen/domain.c
> --- a/xen/arch/ia64/xen/domain.c Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/xen/domain.c Mon Jul 24 21:37:15 2006 +0900
> @@ -117,8 +117,12 @@ static void flush_vtlb_for_context_switc
> if (VMX_DOMAIN(vcpu)) {
> // currently vTLB for vt-i domian is per vcpu.
> // so any flushing isn't needed.
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + } else if (HAS_PERVCPU_VHPT(v->domain)) {
> + // nothing to do
> +#endif
> } else {
> - vhpt_flush();
> + local_vhpt_flush();
> }
> local_flush_tlb_all();
> }
> @@ -133,9 +137,13 @@ void schedule_tail(struct vcpu *prev)
> vmx_do_launch(current);
> } else {
> ia64_set_iva(&ia64_ivt);
> - ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
> - VHPT_ENABLED);
> + // disable VHPT. ia64_new_rr7() might cause VHPT
> + // fault without this because it flushes dtr[IA64_TR_VHPT]
> + // (VHPT_SIZE_LOG2 << 2) is just for avoid
> + // Reserved Register/Field fault.
> + ia64_set_pta(VHPT_SIZE_LOG2 << 2);
> load_region_regs(current);
> + ia64_set_pta(vcpu_pta(current));
> vcpu_load_kernel_regs(current);
> __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
> shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
> @@ -186,9 +194,13 @@ if (!i--) { i = 1000000; printk("+"); }
>
> nd = current->domain;
> if (!is_idle_domain(nd)) {
> - ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
> - VHPT_ENABLED);
> + // disable VHPT. ia64_new_rr7() might cause VHPT
> + // fault without this because it changes dtr[IA64_TR_VHPT]
> + // (VHPT_SIZE_LOG2 << 2) is just for avoid
> + // Reserved Register/Field fault.
> + ia64_set_pta(VHPT_SIZE_LOG2 << 2);
> load_region_regs(current);
> + ia64_set_pta(vcpu_pta(current));
> vcpu_load_kernel_regs(current);
> vcpu_set_next_timer(current);
> if (vcpu_timer_expired(current))
> @@ -305,6 +317,17 @@ struct vcpu *alloc_vcpu_struct(struct do
> v->arch.ending_rid = d->arch.ending_rid;
> v->arch.breakimm = d->arch.breakimm;
> v->arch.last_processor = INVALID_PROCESSOR;
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + if (HAS_PERVCPU_VHPT(d)) {
> + if (pervcpu_vhpt_alloc(v) < 0) {
> + free_xenheap_pages(v->arch.privregs,
> + get_order(sizeof(mapped_regs_t)));
> + free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
> + return NULL;
> + }
> + }
> +#endif
> }
>
> return v;
> @@ -315,6 +338,10 @@ void free_vcpu_struct(struct vcpu *v)
> if (VMX_DOMAIN(v))
> vmx_relinquish_vcpu_resources(v);
> else {
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + if (HAS_PERVCPU_VHPT(v->domain))
> + pervcpu_vhpt_free(v);
> +#endif
> if (v->arch.privregs != NULL)
> free_xenheap_pages(v->arch.privregs,
> get_order_from_shift(XMAPPEDREGS_SHIFT));
> @@ -340,6 +367,11 @@ static void init_switch_stack(struct vcp
> memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
> }
>
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +static int opt_pervcpu_vhpt = 1;
> +integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
> +#endif
> +
> int arch_domain_create(struct domain *d)
> {
> int i;
> @@ -354,6 +386,13 @@ int arch_domain_create(struct domain *d)
> if (is_idle_domain(d))
> return 0;
>
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
> +#if 1
> + DPRINTK("%s:%d domain %d pervcpu_vhpt %d\n",
> + __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
> +#endif
> +#endif
> #ifdef CONFIG_XEN_IA64_TLB_TRACK
> if (tlb_track_create(d) < 0)
> goto fail_nomem;
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/xen/regionreg.c
> --- a/xen/arch/ia64/xen/regionreg.c Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/xen/regionreg.c Mon Jul 24 21:37:15 2006 +0900
> @@ -260,7 +260,7 @@ int set_one_rr(unsigned long rr, unsigne
> } else if (rreg == 7) {
> ia64_new_rr7(vmMangleRID(newrrv.rrval),v->domain->shared_info,
> v->arch.privregs, v->domain->arch.shared_info_va,
> - __get_cpu_var(vhpt_paddr));
> + vcpu_vhpt_maddr(v));
> } else {
> set_rr(rr,newrrv.rrval);
> }
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/xen/vhpt.c
> --- a/xen/arch/ia64/xen/vhpt.c Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/xen/vhpt.c Mon Jul 24 21:37:15 2006 +0900
> @@ -23,18 +23,30 @@ DEFINE_PER_CPU (unsigned long, vhpt_padd
> DEFINE_PER_CPU (unsigned long, vhpt_paddr);
> DEFINE_PER_CPU (unsigned long, vhpt_pend);
>
> -void vhpt_flush(void)
> -{
> - struct vhpt_lf_entry *v = __va(__ia64_per_cpu_var(vhpt_paddr));
> +static void __vhpt_flush(unsigned long vhpt_maddr)
> +{
> + struct vhpt_lf_entry *v =(struct vhpt_lf_entry*)__va(vhpt_maddr);
> int i;
>
> for (i = 0; i < VHPT_NUM_ENTRIES; i++, v++)
> v->ti_tag = INVALID_TI_TAG;
> }
>
> -static void vhpt_erase(void)
> -{
> - struct vhpt_lf_entry *v = (struct vhpt_lf_entry *)VHPT_ADDR;
> +void local_vhpt_flush(void)
> +{
> + __vhpt_flush(__ia64_per_cpu_var(vhpt_paddr));
> +}
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +static void vcpu_vhpt_flush(struct vcpu* v)
> +{
> + __vhpt_flush(vcpu_vhpt_maddr(v));
> +}
> +#endif
> +
> +static void vhpt_erase(unsigned long vhpt_maddr)
> +{
> + struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
> int i;
>
> for (i = 0; i < VHPT_NUM_ENTRIES; i++, v++) {
> @@ -44,17 +56,6 @@ static void vhpt_erase(void)
> v->ti_tag = INVALID_TI_TAG;
> }
> // initialize cache too???
> -}
> -
> -
> -static void vhpt_map(unsigned long pte)
> -{
> - unsigned long psr;
> -
> - psr = ia64_clear_ic();
> - ia64_itr(0x2, IA64_TR_VHPT, VHPT_ADDR, pte, VHPT_SIZE_LOG2);
> - ia64_set_psr(psr);
> - ia64_srlz_i();
> }
>
> void vhpt_insert (unsigned long vadr, unsigned long pte, unsigned long logps)
> @@ -101,7 +102,7 @@ void vhpt_multiple_insert(unsigned long
>
> void vhpt_init(void)
> {
> - unsigned long paddr, pte;
> + unsigned long paddr;
> struct page_info *page;
> #if !VHPT_ENABLED
> return;
> @@ -121,13 +122,54 @@ void vhpt_init(void)
> __get_cpu_var(vhpt_pend) = paddr + (1 << VHPT_SIZE_LOG2) - 1;
> printf("vhpt_init: vhpt paddr=0x%lx, end=0x%lx\n",
> paddr, __get_cpu_var(vhpt_pend));
> - pte = pte_val(pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL));
> - vhpt_map(pte);
> - ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
> - VHPT_ENABLED);
> - vhpt_erase();
> -}
> -
> + vhpt_erase(paddr);
> + // we don't enable VHPT here.
> + // context_switch() or schedule_tail() does it.
> +}
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +int
> +pervcpu_vhpt_alloc(struct vcpu *v)
> +{
> + unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;
> + DPRINTK("%s:%d allocating d 0x%p %d v 0x%p %d\n",
> + __func__, __LINE__,
> + v->domain, v->domain->domain_id,
> + v, v->vcpu_id);
> +
> + v->arch.vhpt_entries =
> + (1UL << vhpt_size_log2) / sizeof(struct vhpt_lf_entry);
> + v->arch.vhpt_page =
> + alloc_domheap_pages(NULL, vhpt_size_log2 - PAGE_SHIFT, 0);
> + if (!v->arch.vhpt_page)
> + return -ENOMEM;
> +
> + v->arch.vhpt_maddr = page_to_maddr(v->arch.vhpt_page);
> + if (v->arch.vhpt_maddr & ((1 << VHPT_SIZE_LOG2) - 1))
> + panic("pervcpu_vhpt_init: bad VHPT alignment!\n");
> +
> + v->arch.pta.val = 0; // zero clear
> + v->arch.pta.ve = 1; // enable vhpt
> + v->arch.pta.size = VHPT_SIZE_LOG2;
> + v->arch.pta.vf = 1; // long format
> + v->arch.pta.base = v->arch.vhpt_maddr >> 15;
> +
> + vhpt_erase(v->arch.vhpt_maddr);
> + return 0;
> +}
> +
> +void
> +pervcpu_vhpt_free(struct vcpu *v)
> +{
> + unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;
> + DPRINTK("%s:%d freeing d 0x%p %d v 0x%p %d\n",
> + __func__, __LINE__,
> + v->domain, v->domain->domain_id,
> + v, v->vcpu_id);
> +
> + free_domheap_pages(v->arch.vhpt_page, vhpt_size_log2 - PAGE_SHIFT);
> +}
> +#endif
>
> void vcpu_flush_vtlb_all(struct vcpu *v)
> {
> @@ -136,7 +178,15 @@ void vcpu_flush_vtlb_all(struct vcpu *v)
> vcpu_purge_tr_entry(&PSCBX(v,itlb));
>
> /* Then VHPT. */
> - vhpt_flush ();
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + if (HAS_PERVCPU_VHPT(v->domain)) {
> + vcpu_vhpt_flush(v);
> + } else {
> + local_vhpt_flush();
> + }
> +#else
> + local_vhpt_flush();
> +#endif
>
> /* Then mTLB. */
> local_flush_tlb_all ();
> @@ -169,9 +219,10 @@ void domain_flush_vtlb_all (void)
> }
> }
>
> -static void cpu_flush_vhpt_range (int cpu, u64 vadr, u64 addr_range)
> -{
> - void *vhpt_base = __va(per_cpu(vhpt_paddr, cpu));
> +static void __flush_vhpt_range(unsigned long vhpt_maddr,
> + u64 vadr, u64 addr_range)
> +{
> + void *vhpt_base = __va(vhpt_maddr);
>
> while ((long)addr_range > 0) {
> /* Get the VHPT entry. */
> @@ -184,9 +235,30 @@ static void cpu_flush_vhpt_range (int cp
> }
> }
>
> +static void cpu_vhpt_flush_range(int cpu, u64 vadr, u64 addr_range)
> +{
> + __flush_vhpt_range(per_cpu(vhpt_paddr, cpu), vadr, addr_range);
> +}
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +static void vcpu_vhpt_flush_range(struct vcpu* v, u64 vadr, u64 addr_range)
> +{
> + __flush_vhpt_range(vcpu_vhpt_maddr(v), vadr, addr_range);
> +}
> +#endif
> +
> void vcpu_flush_tlb_vhpt_range (u64 vadr, u64 log_range)
> {
> - cpu_flush_vhpt_range (current->processor, vadr, 1UL << log_range);
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + if (HAS_PERVCPU_VHPT(current->domain)) {
> + vcpu_vhpt_flush_range(current, vadr, 1UL << log_range);
> + } else {
> + cpu_vhpt_flush_range(current->processor,
> + vadr, 1UL << log_range);
> + }
> +#else
> + cpu_vhpt_flush_range(current->processor, vadr, 1UL << log_range);
> +#endif
> ia64_ptcl(vadr, log_range << 2);
> ia64_srlz_i();
> }
> @@ -218,8 +290,17 @@ void domain_flush_vtlb_range (struct dom
> if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
> continue;
>
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + if (HAS_PERVCPU_VHPT(d)) {
> + vcpu_vhpt_flush_range(v, vadr, addr_range);
> + } else {
> + /* Invalidate VHPT entries. */
> + cpu_vhpt_flush_range(v->processor, vadr, addr_range);
> + }
> +#else
> /* Invalidate VHPT entries. */
> - cpu_flush_vhpt_range (v->processor, vadr, addr_range);
> + cpu_vhpt_flush_range(v->processor, vadr, addr_range);
> +#endif
> }
> // ptc.ga has release semantics.
>
> @@ -254,11 +335,30 @@ domain_flush_vltb_track_entry(struct dom
> }
> smp_mb();
>
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + if (HAS_PERVCPU_VHPT(d)) {
> + for_each_vcpu(d, v) {
> + if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
> + continue;
> + if (!vcpu_isset(v->vcpu_id, entry->vcpu_dirty_mask))
> + continue;
> +
> + /* Invalidate VHPT entries. */
> + vcpu_vhpt_flush_range(v, entry->vaddr, PAGE_SIZE);
> + }
> + } else {
> + for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
> + /* Invalidate VHPT entries. */
> + cpu_vhpt_flush_range(cpu, entry->vaddr, PAGE_SIZE);
> + }
> + }
> +#else
> for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
> //printk("%s:%d cpu %d\n", __func__, __LINE__, cpu);
> /* Invalidate VHPT entries. */
> - cpu_flush_vhpt_range(cpu, entry->vaddr, PAGE_SIZE);
> - }
> + cpu_vhpt_flush_range(cpu, entry->vaddr, PAGE_SIZE);
> + }
> +#endif
> // ptc.ga has release semantics.
>
> /* ptc.ga */
> @@ -272,7 +372,7 @@ static void flush_tlb_vhpt_all (struct d
> static void flush_tlb_vhpt_all (struct domain *d)
> {
> /* First VHPT. */
> - vhpt_flush ();
> + local_vhpt_flush ();
>
> /* Then mTLB. */
> local_flush_tlb_all ();
> @@ -281,7 +381,14 @@ void domain_flush_destroy (struct domain
> void domain_flush_destroy (struct domain *d)
> {
> /* Very heavy... */
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + if (HAS_PERVCPU_VHPT(d))
> + on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1);
> + else
> + on_each_cpu((void (*)(void *))flush_tlb_vhpt_all, d, 1, 1);
> +#else
> on_each_cpu ((void (*)(void *))flush_tlb_vhpt_all, d, 1, 1);
> +#endif
> cpus_clear (d->domain_dirty_cpumask);
> }
>
> diff -r cb0aa2b2e180 -r a56d48066373 xen/include/asm-ia64/domain.h
> --- a/xen/include/asm-ia64/domain.h Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/include/asm-ia64/domain.h Mon Jul 24 21:37:15 2006 +0900
> @@ -63,6 +63,9 @@ struct arch_domain {
> unsigned long flags;
> struct {
> unsigned int is_vti : 1;
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + unsigned int has_pervcpu_vhpt : 1;
> +#endif
> };
> };
>
> @@ -108,6 +111,13 @@ struct arch_domain {
> #define INT_ENABLE_OFFSET(v) \
> (sizeof(vcpu_info_t) * (v)->vcpu_id + \
> offsetof(vcpu_info_t, evtchn_upcall_mask))
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +#define HAS_PERVCPU_VHPT(d) ((d)->arch.has_pervcpu_vhpt)
> +#else
> +#define HAS_PERVCPU_VHPT(d) (0)
> +#endif
> +
>
> struct arch_vcpu {
> /* Save the state of vcpu.
> @@ -158,6 +168,13 @@ struct arch_vcpu {
> fpswa_ret_t fpswa_ret; /* save return values of FPSWA emulation */
> struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */
>
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + PTA pta;
> + unsigned long vhpt_maddr;
> + struct page_info* vhpt_page;
> + unsigned long vhpt_entries;
> +#endif
> +
> #define INVALID_PROCESSOR INT_MAX
> int last_processor;
> };
> diff -r cb0aa2b2e180 -r a56d48066373 xen/include/asm-ia64/vhpt.h
> --- a/xen/include/asm-ia64/vhpt.h Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/include/asm-ia64/vhpt.h Mon Jul 24 21:37:15 2006 +0900
> @@ -42,11 +42,47 @@ extern void vhpt_multiple_insert(unsigne
> unsigned long logps);
> extern void vhpt_insert (unsigned long vadr, unsigned long pte,
> unsigned long logps);
> -void vhpt_flush(void);
> +void local_vhpt_flush(void);
>
> /* Currently the VHPT is allocated per CPU. */
> DECLARE_PER_CPU (unsigned long, vhpt_paddr);
> DECLARE_PER_CPU (unsigned long, vhpt_pend);
>
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +#if !VHPT_ENABLED
> +#error "VHPT_ENABLED must be set for CONFIG_XEN_IA64_PERVCPU_VHPT"
> +#endif
> +int pervcpu_vhpt_alloc(struct vcpu *v);
> +void pervcpu_vhpt_free(struct vcpu *v);
> +#endif
> +#include <xen/sched.h>
> +static inline unsigned long
> +vcpu_vhpt_maddr(struct vcpu* v)
> +{
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + if (HAS_PERVCPU_VHPT(v->domain)) {
> + return v->arch.vhpt_maddr;
> + }
> +#endif
> +
> +#if 0
> + // referencing v->processor is racy.
> + return per_cpu(vhpt_paddr, v->processor);
> +#endif
> + BUG_ON(v != current);
> + return __get_cpu_var(vhpt_paddr);
> +}
> +
> +static inline unsigned long
> +vcpu_pta(struct vcpu* v)
> +{
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> + if (HAS_PERVCPU_VHPT(v->domain)) {
> + return v->arch.pta.val;
> + }
> +#endif
> + return VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) | VHPT_ENABLED;
> +}
> +
> #endif /* !__ASSEMBLY */
> #endif
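
Not part of the patch, just a rough sketch of how the vcpu switch path
(context_switch()/schedule_tail()) is expected to consume the new accessors.
The helper name load_vhpt_on_switch() is made up for illustration; everything
else comes from the patch or existing Xen/ia64 code:

    /* Illustrative sketch only.  Load the PTA chosen by vcpu_pta()
     * when switching to vcpu "next". */
    static void load_vhpt_on_switch(struct vcpu *next)
    {
        /* Per-vcpu VHPT: next->arch.pta.val as prepared by
         * pervcpu_vhpt_alloc(); otherwise the per-pcpu
         * VHPT_ADDR-based PTA value. */
        ia64_set_pta(vcpu_pta(next));
        ia64_srlz_i();

        /* Conservative: when the domain has no per-vcpu VHPT, the
         * shared per-pcpu table may still hold entries installed for
         * the previously running vcpu, so flush it.  The real switch
         * code can skip this when nothing changed. */
        if (!HAS_PERVCPU_VHPT(next->domain))
            local_vhpt_flush();
    }

The point of the accessors is to keep this decision in one place, so callers
like set_one_rr() don't need their own #ifdefs.
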
--
yamahata
10701:3cee9325a6c6_import_linux_hash.h.patch
Description: Text document
10702:b90fff753ca1_tlb_track.patch
Description: Text document
10703:f9b91b850f7b_pervcpu_vhpt.patch
Description: Text document
_______________________________________________
Xen-ia64-devel mailing list
Xen-ia64-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-ia64-devel