# HG changeset patch
# User root@iwillnow.amd.com
# Node ID 502810b11c7972aabebd8e898e45f63189a493e5
# Parent  d39e8c44da34ac257b26cce29122852ef3a930cc
Add support for the Opteron AGP GART/aperture as a simplified IOMMU

diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c	Fri Feb 09 16:32:04 2007 -0600
@@ -252,7 +252,7 @@ static void contiguous_bitmap_clear(
 }
 
 /* Protected by balloon_lock. */
-#define MAX_CONTIG_ORDER 9 /* 2MB */
+#define MAX_CONTIG_ORDER 16 /* 256MB */
 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
 static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
 
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/Kconfig
--- a/linux-2.6-xen-sparse/arch/x86_64/Kconfig	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/Kconfig	Fri Feb 09 16:32:04 2007 -0600
@@ -434,7 +434,7 @@ config IOMMU
 	default y
 	select SWIOTLB
 	select AGP
-	depends on PCI && !X86_64_XEN
+	depends on PCI
 	help
 	  Support for full DMA access of devices with 32bit memory access only
 	  on systems with more than 3GB. This is usually needed for USB,
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile	Fri Feb 09 16:32:04 2007 -0600
@@ -59,7 +59,7 @@ alternative-y			+= ../../i386/kernel/alt
 
 ifdef CONFIG_XEN
 time-y				+= ../../i386/kernel/time-xen.o
-pci-dma-y			+= ../../i386/kernel/pci-dma-xen.o
+pci-dma-y			+= pci-dma-xen.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
 intel_cacheinfo-y		:= ../../i386/kernel/cpu/intel_cacheinfo-xen.o
 quirks-y			:= ../../i386/kernel/quirks-xen.o
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -97,7 +97,6 @@ static inline int bad_addr(unsigned long
 	return 0;
 } 
 
-#ifndef CONFIG_XEN
 /*
  * This function checks if any part of the range <start,end> is mapped
  * with type.
@@ -116,7 +115,6 @@ e820_any_mapped(unsigned long start, uns
 	} 
 	return 0;
 }
-#endif
 
 /*
  * This function checks if the entire range <start,end> is mapped with type.
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -369,7 +369,7 @@ void __init check_ioapic(void)
 				switch (vendor) { 
 				case PCI_VENDOR_ID_VIA:
 #ifdef CONFIG_IOMMU
-					if ((end_pfn > MAX_DMA32_PFN ||
+					if ((global_need_iommu() ||
 					     force_iommu) &&
 					    !iommu_aperture_allowed) {
 						printk(KERN_INFO
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/pci-swiotlb-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-swiotlb-xen.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-swiotlb-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -8,13 +8,10 @@
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
 
-#if 0
 int swiotlb __read_mostly;
 EXPORT_SYMBOL(swiotlb);
-#endif
 
 struct dma_mapping_ops swiotlb_dma_ops = {
-#if 0
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc_coherent = swiotlb_alloc_coherent,
 	.free_coherent = swiotlb_free_coherent,
@@ -29,14 +26,12 @@ struct dma_mapping_ops swiotlb_dma_ops =
 	.map_sg = swiotlb_map_sg,
 	.unmap_sg = swiotlb_unmap_sg,
 	.dma_supported = NULL,
-#endif
 };
 
 void pci_swiotlb_init(void)
 {
-#if 0
 	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
-	if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
+	if (!iommu_detected && !no_iommu)
 	       swiotlb = 1;
 	if (swiotlb_force)
 		swiotlb = 1;
@@ -45,11 +40,4 @@ void pci_swiotlb_init(void)
 		swiotlb_init();
 		dma_ops = &swiotlb_dma_ops;
 	}
-#else
-	swiotlb_init();
-	if (swiotlb) {
-		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
-		dma_ops = &swiotlb_dma_ops;
-	}
-#endif
 }
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -844,6 +844,7 @@ void __init paging_init(void)
 }
 #endif
 
+#ifndef CONFIG_XEN
 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
    from the CPU leading to inconsistent cache lines. address and size
    must be aligned to 2MB boundaries. 
@@ -877,6 +878,39 @@ void __init clear_kernel_mapping(unsigne
 	}
 	__flush_tlb_all();
 } 
+#else
+/* In Xen the direct mapping doesn't use super pages. */
+void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
+{
+	unsigned long end = address + size;
+
+	BUG_ON(address & ~PAGE_MASK);
+	BUG_ON(size & ~PAGE_MASK); 
+
+	for (; address < end; address += PAGE_SIZE) { 
+		pgd_t *pgd = pgd_offset_k(address);
+		pud_t *pud;
+		pmd_t *pmd;
+		pte_t *pte;
+
+		if (pgd_none(*pgd))
+			continue;
+		pud = pud_offset(pgd, address);
+		if (pud_none(*pud))
+			continue; 
+		pmd = pmd_offset(pud, address);
+		if (!pmd || pmd_none(*pmd))
+			continue; 
+		pte = pte_offset_map(pmd, address);
+		if (!pte || pte_none(*pte))
+			continue;
+
+		pte_clear(&init_mm,address,pte);
+
+	}
+	__flush_tlb_all();
+} 
+#endif
 
 /*
  * Memory hotplug specific functions
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h	Fri Feb 09 16:32:04 2007 -0600
@@ -62,7 +62,12 @@ static inline int valid_dma_direction(in
 		(dma_direction == DMA_FROM_DEVICE));
 }
 
-#if 0
+#ifdef CONFIG_XEN
+#define global_need_iommu() 1
+#else
+#define global_need_iommu() (HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL)>MAX_DMA32_PFN)
+#endif
+
 static inline int dma_mapping_error(dma_addr_t dma_addr)
 {
 	if (dma_ops->mapping_error)
@@ -200,8 +205,5 @@ dma_cache_sync(void *vaddr, size_t size,
 
 extern struct device fallback_dev;
 extern int panic_on_overflow;
-#endif
 
 #endif /* _X8664_DMA_MAPPING_H */
-
-#include <asm-i386/mach-xen/asm/dma-mapping.h>
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/lib/Makefile
--- a/linux-2.6-xen-sparse/lib/Makefile	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/lib/Makefile	Fri Feb 09 16:32:04 2007 -0600
@@ -51,8 +51,7 @@ obj-$(CONFIG_SMP) += percpu_counter.o
 obj-$(CONFIG_SMP) += percpu_counter.o
 obj-$(CONFIG_AUDIT_GENERIC) += audit.o
 
-obj-$(CONFIG_SWIOTLB) += swiotlb.o
-swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o
+obj-$(CONFIG_SWIOTLB) += swiotlb-xen.o
 
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
diff -r d39e8c44da34 -r 502810b11c79 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/xen/arch/x86/mm.c	Fri Feb 09 16:32:04 2007 -0600
@@ -2094,6 +2094,12 @@ int do_mmuext_op(
                 }
             }
             break;
+
+	case MMUEXT_UNMAP_REGION:
+            map_pages_to_xen(
+                PAGE_OFFSET + (op.arg1.mfn << PAGE_SHIFT),
+                op.arg1.mfn, op.arg2.nr_ents, PAGE_HYPERVISOR & ~_PAGE_PRESENT);
+	    break;
 #endif
         
         case MMUEXT_TLB_FLUSH_LOCAL:
diff -r d39e8c44da34 -r 502810b11c79 xen/include/public/xen.h
--- a/xen/include/public/xen.h	Fri Feb 09 10:48:41 2007 +0000
+++ b/xen/include/public/xen.h	Fri Feb 09 16:32:04 2007 -0600
@@ -188,6 +188,10 @@
  * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
  * mfn: Machine frame number of new page-table base to install in MMU
  *      when in user space.
+ *
+ * cmd: MMUEXT_UNMAP_REGION [x86/64 only]
+ * mfn: First machine frame number to unmap from the hypervisor's virtual memory.
+ * nr_ents: Number of mfns to unmap.
  * 
  * cmd: MMUEXT_TLB_FLUSH_LOCAL
  * No additional arguments. Flushes local TLB.
@@ -230,6 +234,7 @@
 #define MMUEXT_FLUSH_CACHE      12
 #define MMUEXT_SET_LDT          13
 #define MMUEXT_NEW_USER_BASEPTR 15
+#define MMUEXT_UNMAP_REGION     16
 
 #ifndef __ASSEMBLY__
 struct mmuext_op {
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/aperture-xen.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/aperture-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -0,0 +1,284 @@
+/* 
+ * Firmware replacement code.
+ * 
+ * Work around broken BIOSes that don't set an aperture or only set the
+ * aperture in the AGP bridge. 
+ * If all fails map the aperture over some low memory.  This is cheaper than 
+ * doing bounce buffering. The memory is lost. This is done at early boot 
+ * because only the bootmem allocator can allocate 32+MB. 
+ * 
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+#include <asm/proto.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/k8.h>
+
+int iommu_aperture;
+int iommu_aperture_disabled __initdata = 0;
+int iommu_aperture_allowed __initdata = 0;
+
+int fallback_aper_order __initdata = 1; /* 64MB */
+int fallback_aper_force __initdata = 0; 
+
+int fix_aperture __initdata = 1;
+
+/* This code runs before the PCI subsystem is initialized, so just
+   access the northbridge directly. */
+
+static u32 __init allocate_aperture(void) 
+{
+	pg_data_t *nd0 = NODE_DATA(0);
+	u32 aper_size;
+	void *p; 
+
+	if (fallback_aper_order > 7) 
+		fallback_aper_order = 7; 
+	aper_size = (32 * 1024 * 1024) << fallback_aper_order; 
+
+	/* 
+	 * Aperture has to be naturally aligned. This means an 2GB aperture won't
+	 * have much chances to find a place in the lower 4GB of memory.
+	 * Unfortunately we cannot move it up because that would make the
+	 * IOMMU useless.
+	 */
+	p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); 
+#ifndef CONFIG_XEN
+	if (!p || __pa(p)+aper_size > 0xffffffff) {
+#else
+	if (!p || (xen_create_contiguous_region((unsigned long)p, get_order(aper_size), 31) != 0)||virt_to_bus(p)+aper_size > 0xffffffff) {
+#endif
+		printk("Cannot allocate aperture memory hole (%p,%uK)\n",
+		       p, aper_size>>10);
+		if (p)
+			free_bootmem_node(nd0, __pa(p), aper_size); 
+		return 0;
+	}
+	printk("Mapping aperture over %d KB of RAM @ %lx\n",
+	       aper_size >> 10, virt_to_bus(p)); 
+	return (u32)virt_to_bus(p); 
+}
+
+static int __init aperture_valid(u64 aper_base, u32 aper_size)
+{ 
+	if (!aper_base) 
+		return 0;
+	if (aper_size < 64*1024*1024) { 
+		printk("Aperture too small (%d MB)\n", aper_size>>20);
+		return 0;
+	}
+	if (aper_base + aper_size >= 0xffffffff) { 
+		printk("Aperture beyond 4GB. Ignoring.\n");
+		return 0; 
+	}
+	if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+		printk("Aperture pointing to e820 RAM. Ignoring.\n");
+		return 0; 
+	} 
+	return 1;
+} 
+
+/* Find a PCI capability */
+static __u32 __init find_cap(int num, int slot, int func, int cap) 
+{ 
+	u8 pos;
+	int bytes;
+	if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
+		return 0;
+	pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
+	for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 
+		u8 id;
+		pos &= ~3; 
+		id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
+		if (id == 0xff)
+			break;
+		if (id == cap) 
+			return pos; 
+		pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); 
+	} 
+	return 0;
+} 
+
+/* Read a standard AGPv3 bridge header */
+static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
+{ 
+	u32 apsize;
+	u32 apsizereg;
+	int nbits;
+	u32 aper_low, aper_hi;
+	u64 aper;
+
+	printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
+	apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
+	if (apsizereg == 0xffffffff) {
+		printk("APSIZE in AGP bridge unreadable\n");
+		return 0;
+	}
+
+	apsize = apsizereg & 0xfff;
+	/* Some BIOS use weird encodings not in the AGPv3 table. */
+	if (apsize & 0xff) 
+		apsize |= 0xf00; 
+	nbits = hweight16(apsize);
+	*order = 7 - nbits;
+	if ((int)*order < 0) /* < 32MB */
+		*order = 0;
+	
+	aper_low = read_pci_config(num,slot,func, 0x10);
+	aper_hi = read_pci_config(num,slot,func,0x14);
+	aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+
+	printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 
+	       aper, 32 << *order, apsizereg);
+
+	if (!aperture_valid(aper, (32*1024*1024) << *order))
+	    return 0;
+	return (u32)aper; 
+} 
+
+/* Look for an AGP bridge. Windows only expects the aperture in the
+   AGP bridge and some BIOS forget to initialize the Northbridge too.
+   Work around this here. 
+
+   Do an PCI bus scan by hand because we're running before the PCI
+   subsystem. 
+
+   All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+   generically. It's probably overkill to always scan all slots because
+   the AGP bridges should be always an own bus on the HT hierarchy, 
+   but do it here for future safety. */
+static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+{
+	int num, slot, func;
+
+	/* Poor man's PCI discovery */
+	for (num = 0; num < 256; num++) { 
+		for (slot = 0; slot < 32; slot++) { 
+			for (func = 0; func < 8; func++) { 
+				u32 class, cap;
+				u8 type;
+				class = read_pci_config(num,slot,func,
+							PCI_CLASS_REVISION);
+				if (class == 0xffffffff)
+					break; 
+				
+				switch (class >> 16) { 
+				case PCI_CLASS_BRIDGE_HOST:
+				case PCI_CLASS_BRIDGE_OTHER: /* needed? */
+					/* AGP bridge? */
+					cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
+					if (!cap)
+						break;
+					*valid_agp = 1; 
+					return read_agp(num,slot,func,cap,order);
+				} 
+				
+				/* No multi-function device? */
+				type = read_pci_config_byte(num,slot,func,
+							       PCI_HEADER_TYPE);
+				if (!(type & 0x80))
+					break;
+			} 
+		} 
+	}
+	printk("No AGP bridge found\n"); 
+	return 0;
+}
+
+void __init iommu_hole_init(void) 
+{ 
+	int fix, num; 
+	u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
+	u64 aper_base, last_aper_base = 0;
+	int valid_agp = 0;
+
+	if (iommu_aperture_disabled || !fix_aperture)
+		return;
+
+	printk("Checking aperture...\n"); 
+
+	fix = 0;
+	for (num = 24; num < 32; num++) {		
+		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
+			continue;
+
+		iommu_detected = 1;
+		iommu_aperture = 1; 
+
+		aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 
+		aper_size = (32 * 1024 * 1024) << aper_order; 
+		aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
+		aper_base <<= 25; 
+
+		printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, 
+		       aper_base, aper_size>>20);
+		
+		if (!aperture_valid(aper_base, aper_size)) {
+			fix = 1; 
+			break; 
+		}
+
+		if ((last_aper_order && aper_order != last_aper_order) ||
+		    (last_aper_base && aper_base != last_aper_base)) {
+			fix = 1;
+			break;
+		}
+		last_aper_order = aper_order;
+		last_aper_base = aper_base;
+	} 
+
+	if (!fix && !fallback_aper_force) 
+		return; 
+
+	if (!fallback_aper_force)
+		aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 
+		
+	if (aper_alloc) { 
+		/* Got the aperture from the AGP bridge */
+	} else if (swiotlb && !valid_agp) {
+		/* Do nothing */
+	} else if ((!no_iommu && global_need_iommu()) ||
+		   force_iommu ||
+		   valid_agp ||
+		   fallback_aper_force) { 
+		printk("Your BIOS doesn't leave a aperture memory hole\n");
+		printk("Please enable the IOMMU option in the BIOS setup\n");
+		printk("This costs you %d MB of RAM\n",
+		       32 << fallback_aper_order);
+
+		aper_order = fallback_aper_order;
+		aper_alloc = allocate_aperture();
+		if (!aper_alloc) { 
+			/* Could disable AGP and IOMMU here, but it's probably
+			   not worth it. But the later users cannot deal with
+			   bad apertures and turning on the aperture over memory
+			   causes very strange problems, so it's better to 
+			   panic early. */
+			panic("Not enough memory for aperture");
+		}
+	} else { 
+		return; 
+	} 
+
+	/* Fix up the north bridges */
+	for (num = 24; num < 32; num++) { 		
+		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
+			continue;	
+
+		/* Don't enable translation yet. That is done later. 
+		   Assume this BIOS didn't initialise the GART so 
+		   just overwrite all previous bits */ 
+		write_pci_config(0, num, 3, 0x90, aper_order<<1); 
+		write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 
+	} 
+} 
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/pci-dma-xen.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-dma-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -0,0 +1,328 @@
+/*
+ * Dynamic DMA mapping support.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <asm/io.h>
+#include <asm/proto.h>
+#include <asm/calgary.h>
+#include <xen/balloon.h>
+#include <asm/tlbflush.h>
+
+int iommu_merge __read_mostly = 0;
+EXPORT_SYMBOL(iommu_merge);
+
+dma_addr_t bad_dma_address __read_mostly;
+EXPORT_SYMBOL(bad_dma_address);
+
+/* This tells the BIO block layer to assume merging. Default to off
+   because we cannot guarantee merging later. */
+int iommu_bio_merge __read_mostly = 0;
+EXPORT_SYMBOL(iommu_bio_merge);
+
+int iommu_sac_force __read_mostly = 0;
+EXPORT_SYMBOL(iommu_sac_force);
+
+int no_iommu __read_mostly;
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow __read_mostly = 1;
+int force_iommu __read_mostly = 1;
+#else
+int panic_on_overflow __read_mostly = 0;
+int force_iommu __read_mostly= 0;
+#endif
+
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
+
+/* Dummy device used for NULL arguments (normally ISA). Better would
+   be probably a smaller DMA mask, but this is bug-to-bug compatible
+   to i386. */
+struct device fallback_dev = {
+	.bus_id = "fallback device",
+	.coherent_dma_mask = DMA_32BIT_MASK,
+	.dma_mask = &fallback_dev.coherent_dma_mask,
+};
+
+/* Allocate DMA memory on node near device */
+noinline static void *
+dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
+{
+	struct page *page;
+	int node;
+#ifdef CONFIG_PCI
+	if (dev->bus == &pci_bus_type)
+		node = pcibus_to_node(to_pci_dev(dev)->bus);
+	else
+#endif
+		node = numa_node_id();
+
+	if (node < first_node(node_online_map))
+		node = first_node(node_online_map);
+
+	page = alloc_pages_node(node, gfp, order);
+	return page ? page_address(page) : NULL;
+}
+
+/*
+ * Allocate memory for a coherent mapping.
+ */
+void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		   gfp_t gfp)
+{
+	void *memory;
+	unsigned long dma_mask = 0;
+	u64 bus;
+
+	if (!dev)
+		dev = &fallback_dev;
+	dma_mask = dev->coherent_dma_mask;
+	if (dma_mask == 0)
+		dma_mask = DMA_32BIT_MASK;
+
+	/* Don't invoke OOM killer */
+	gfp |= __GFP_NORETRY;
+
+	/* Kludge to make it bug-to-bug compatible with i386. i386
+	   uses the normal dma_mask for alloc_coherent. */
+	dma_mask &= *dev->dma_mask;
+
+	/* Why <=? Even when the mask is smaller than 4GB it is often
+	   larger than 16MB and in this case we have a chance of
+	   finding fitting memory in the next higher zone first. If
+	   not retry with true GFP_DMA. -AK */
+	if (dma_mask <= DMA_32BIT_MASK)
+		gfp |= GFP_DMA32;
+
+ again:
+	memory = dma_alloc_pages(dev, gfp, get_order(size));
+	if (memory == NULL)
+		return NULL;
+
+	{
+		int high, mmu;
+		bus = virt_to_bus(memory);
+	        high = (bus + size) >= dma_mask;
+		mmu = high;
+		if (force_iommu && !(gfp & GFP_DMA))
+			mmu = 1;
+		else if (high) {
+			free_pages((unsigned long)memory,
+				   get_order(size));
+
+			/* Don't use the 16MB ZONE_DMA unless absolutely
+			   needed. It's better to use remapping first. */
+			if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
+				gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
+				goto again;
+			}
+
+			/* Let low level make its own zone decisions */
+			gfp &= ~(GFP_DMA32|GFP_DMA);
+
+			if (dma_ops->alloc_coherent)
+				return dma_ops->alloc_coherent(dev, size,
+							   dma_handle, gfp);
+			return NULL;
+		}
+
+		/* Hardcode 31 address bits for now: aacraid limitation. */
+		if (xen_create_contiguous_region((unsigned long)memory, get_order(size), 31) != 0) {
+			free_pages((unsigned long)memory, get_order(size));
+			return NULL;
+		}
+
+		memset(memory, 0, size);
+		if (!mmu) {
+			*dma_handle = virt_to_bus(memory);
+			return memory;
+		}
+	}
+
+	if (dma_ops->alloc_coherent) {
+		free_pages((unsigned long)memory, get_order(size));
+		gfp &= ~(GFP_DMA|GFP_DMA32);
+		return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+	}
+
+	if (dma_ops->map_simple) {
+		*dma_handle = dma_ops->map_simple(dev, memory,
+					      size,
+					      PCI_DMA_BIDIRECTIONAL);
+		if (*dma_handle != bad_dma_address)
+			return memory;
+	}
+
+	if (panic_on_overflow)
+		panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
+	free_pages((unsigned long)memory, get_order(size));
+	return NULL;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+/*
+ * Unmap coherent memory.
+ * The caller must ensure that the device has finished accessing the mapping.
+ */
+void dma_free_coherent(struct device *dev, size_t size,
+			 void *vaddr, dma_addr_t bus)
+{
+	if (dma_ops->unmap_single)
+		dma_ops->unmap_single(dev, bus, size, 0);
+	xen_destroy_contiguous_region((unsigned long) vaddr, get_order(size));
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+int dma_supported(struct device *dev, u64 mask)
+{
+	if (dma_ops->dma_supported)
+		return dma_ops->dma_supported(dev, mask);
+
+	/* Copied from i386. Doesn't make much sense, because it will
+	   only work for pci_alloc_coherent.
+	   The caller just has to use GFP_DMA in this case. */
+        if (mask < DMA_24BIT_MASK)
+                return 0;
+
+	/* Tell the device to use SAC when IOMMU force is on.  This
+	   allows the driver to use cheaper accesses in some cases.
+
+	   Problem with this is that if we overflow the IOMMU area and
+	   return DAC as fallback address the device may not handle it
+	   correctly.
+
+	   As a special case some controllers have a 39bit address
+	   mode that is as efficient as 32bit (aic79xx). Don't force
+	   SAC for these.  Assume all masks <= 40 bits are of this
+	   type. Normally this doesn't make any difference, but gives
+	   more gentle handling of IOMMU overflow. */
+	if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+		printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(dma_supported);
+
+int dma_set_mask(struct device *dev, u64 mask)
+{
+	if (!dev->dma_mask || !dma_supported(dev, mask))
+		return -EIO;
+	*dev->dma_mask = mask;
+	return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
+         [,forcesac][,fullflush][,nomerge][,biomerge]
+   size  set size of iommu (in bytes)
+   noagp don't initialize the AGP driver and use full aperture.
+   off   don't use the IOMMU
+   leak  turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
+   memaper[=order] allocate an own aperture over RAM with size 32MB^order.
+   noforce don't force IOMMU usage. Default.
+   force  Force IOMMU.
+   merge  Do lazy merging. This may improve performance on some block devices.
+          Implies force (experimental)
+   biomerge Do merging at the BIO layer. This is more efficient than merge,
+            but should be only done with very big IOMMUs. Implies merge,force.
+   nomerge Don't do SG merging.
+   forcesac For SAC mode for masks <40bits  (experimental)
+   fullflush Flush IOMMU on each allocation (default)
+   nofullflush Don't use IOMMU fullflush
+   allowed  overwrite iommu off workarounds for specific chipsets.
+   soft	 Use software bounce buffering (default for Intel machines)
+   noaperture Don't touch the aperture for AGP.
+*/
+__init int iommu_setup(char *p)
+{
+    iommu_merge = 1;
+
+    while (*p) {
+	    if (!strncmp(p,"off",3))
+		    no_iommu = 1;
+	    /* gart_parse_options has more force support */
+	    if (!strncmp(p,"force",5))
+		    force_iommu = 1;
+	    if (!strncmp(p,"noforce",7)) {
+		    iommu_merge = 0;
+		    force_iommu = 0;
+	    }
+
+	    if (!strncmp(p, "biomerge",8)) {
+		    iommu_bio_merge = 4096;
+		    iommu_merge = 1;
+		    force_iommu = 1;
+	    }
+	    if (!strncmp(p, "panic",5))
+		    panic_on_overflow = 1;
+	    if (!strncmp(p, "nopanic",7))
+		    panic_on_overflow = 0;
+	    if (!strncmp(p, "merge",5)) {
+		    iommu_merge = 1;
+		    force_iommu = 1;
+	    }
+	    if (!strncmp(p, "nomerge",7))
+		    iommu_merge = 0;
+	    if (!strncmp(p, "forcesac",8))
+		    iommu_sac_force = 1;
+
+#ifdef CONFIG_SWIOTLB
+	    if (!strncmp(p, "soft",4))
+		    swiotlb = 1;
+#endif
+
+#ifdef CONFIG_IOMMU
+	    gart_parse_options(p);
+#endif
+
+	    p += strcspn(p, ",");
+	    if (*p == ',')
+		    ++p;
+    }
+    return 1;
+}
+__setup("iommu=", iommu_setup);
+
+void __init pci_iommu_alloc(void)
+{
+	/*
+	 * The order of these functions is important for
+	 * fall-back/fail-over reasons
+	 */
+#ifdef CONFIG_IOMMU
+	iommu_hole_init();
+#endif
+
+#ifdef CONFIG_CALGARY_IOMMU
+	detect_calgary();
+#endif
+
+#ifdef CONFIG_SWIOTLB
+	pci_swiotlb_init();
+#endif
+}
+
+static int __init pci_iommu_init(void)
+{
+#ifdef CONFIG_CALGARY_IOMMU
+	calgary_iommu_init();
+#endif
+
+#ifdef CONFIG_IOMMU
+	gart_iommu_init();
+#endif
+
+	no_iommu_init();
+	return 0;
+}
+
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/pci-gart-xen.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-gart-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -0,0 +1,741 @@
+/*
+ * Dynamic DMA mapping support for AMD Hammer.
+ * 
+ * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
+ * This allows to use PCI devices that only support 32bit addresses on systems
+ * with more than 4GB. 
+ *
+ * See Documentation/DMA-mapping.txt for the interface specification.
+ * 
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/agp_backend.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <asm/atomic.h>
+#include <asm/io.h>
+#include <asm/mtrr.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/cacheflush.h>
+#include <asm/kdebug.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/k8.h>
+
+unsigned long iommu_bus_base;	/* GART remapping area (physical) */
+static unsigned long iommu_size; 	/* size of remapping area bytes */
+static unsigned long iommu_pages;	/* .. and in pages */
+
+u32 *iommu_gatt_base; 		/* Remapping table */
+
+/* If this is disabled the IOMMU will use an optimized flushing strategy
+   of only flushing when an mapping is reused. With it true the GART is flushed 
+   for every mapping. Problem is that doing the lazy flush seems to trigger
+   bugs with some popular PCI cards, in particular 3ware (but has been also
+   also seen with Qlogic at least). */
+int iommu_fullflush = 1;
+
+/* Allocation bitmap for the remapping area */ 
+static DEFINE_SPINLOCK(iommu_bitmap_lock);
+static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
+
+static u32 gart_unmapped_entry; 
+
+#define GPTE_VALID    1
+#define GPTE_COHERENT 2
+#define GPTE_ENCODE(x) \
+	(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
+#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
+
+#define to_pages(addr,size) \
+	(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+
+#define EMERGENCY_PAGES 32 /* = 128KB */ 
+
+#ifdef CONFIG_AGP
+#define AGPEXTERN extern
+#else
+#define AGPEXTERN
+#endif
+
+/* backdoor interface to AGP driver */
+AGPEXTERN int agp_memory_reserved;
+AGPEXTERN __u32 *agp_gatt_table;
+
+static unsigned long next_bit;  /* protected by iommu_bitmap_lock */
+static int need_flush; 		/* global flush state. set for each gart wrap */
+
+static unsigned long alloc_iommu(int size) 
+{ 	
+	unsigned long offset, flags;
+
+	spin_lock_irqsave(&iommu_bitmap_lock, flags);	
+	offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
+	if (offset == -1) {
+		need_flush = 1;
+		offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);
+	}
+	if (offset != -1) { 
+		set_bit_string(iommu_gart_bitmap, offset, size); 
+		next_bit = offset+size; 
+		if (next_bit >= iommu_pages) { 
+			next_bit = 0;
+			need_flush = 1;
+		} 
+	} 
+	if (iommu_fullflush)
+		need_flush = 1;
+	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);      
+	return offset;
+} 
+
+static void free_iommu(unsigned long offset, int size)
+{ 
+	unsigned long flags;
+	spin_lock_irqsave(&iommu_bitmap_lock, flags);
+	__clear_bit_string(iommu_gart_bitmap, offset, size);
+	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+} 
+
+/* 
+ * Use global flush state to avoid races with multiple flushers.
+ */
+static void flush_gart(void)
+{ 
+	unsigned long flags;
+	spin_lock_irqsave(&iommu_bitmap_lock, flags);
+	if (need_flush) {
+		k8_flush_garts();
+		need_flush = 0;
+	} 
+	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+} 
+
+#ifdef CONFIG_IOMMU_LEAK
+
+#define SET_LEAK(x) if (iommu_leak_tab) \
+			iommu_leak_tab[x] = __builtin_return_address(0);
+#define CLEAR_LEAK(x) if (iommu_leak_tab) \
+			iommu_leak_tab[x] = NULL;
+
+/* Debugging aid for drivers that don't free their IOMMU tables */
+static void **iommu_leak_tab; 
+static int leak_trace;
+int iommu_leak_pages = 20; 
+void dump_leak(void)
+{
+	int i;
+	static int dump; 
+	if (dump || !iommu_leak_tab) return;
+	dump = 1;
+	show_stack(NULL,NULL);
+	/* Very crude. dump some from the end of the table too */ 
+	printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); 
+	for (i = 0; i < iommu_leak_pages; i+=2) {
+		printk("%lu: ", iommu_pages-i);
+		printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
+		printk("%c", (i+1)%2 == 0 ? '\n' : ' '); 
+	} 
+	printk("\n");
+}
+#else
+#define SET_LEAK(x)
+#define CLEAR_LEAK(x)
+#endif
+
+static void iommu_full(struct device *dev, size_t size, int dir)
+{
+	/* 
+	 * Ran out of IOMMU space for this operation. This is very bad.
+	 * Unfortunately the drivers cannot handle this operation properly.
+	 * Return some non mapped prereserved space in the aperture and 
+	 * let the Northbridge deal with it. This will result in garbage
+	 * in the IO operation. When the size exceeds the prereserved space
+	 * memory corruption will occur or random memory will be DMAed 
+	 * out. Hopefully no network devices use single mappings that big.
+	 */ 
+	
+	printk(KERN_ERR 
+  "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
+	       size, dev->bus_id);
+
+	if (size > PAGE_SIZE*EMERGENCY_PAGES) {
+		if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+			panic("PCI-DMA: Memory would be corrupted\n");
+		if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) 
+			panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n");
+	} 
+
+#ifdef CONFIG_IOMMU_LEAK
+	dump_leak(); 
+#endif
+} 
+
+static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
+{ 
+	u64 mask = *dev->dma_mask;
+	int high = addr + size >= mask;
+	int mmu = high;
+	if (force_iommu) 
+		mmu = 1; 
+	return mmu; 
+}
+
+static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
+{ 
+	u64 mask = *dev->dma_mask;
+	int high = addr + size >= mask;
+	int mmu = high;
+	return mmu; 
+}
+
+/* Map a single continuous physical area into the IOMMU.
+ * Caller needs to check if the iommu is needed and flush.
+ */
+static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
+				size_t size, int dir)
+{ 
+	unsigned long npages = to_pages(phys_mem, size);
+	unsigned long iommu_page = alloc_iommu(npages);
+	int i;
+	if (iommu_page == -1) {
+		if (!nonforced_iommu(dev, phys_mem, size))
+			return phys_mem; 
+		if (panic_on_overflow)
+			panic("dma_map_area overflow %lu bytes\n", size);
+		iommu_full(dev, size, dir);
+		return bad_dma_address;
+	}
+
+	for (i = 0; i < npages; i++) {
+		iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
+		SET_LEAK(iommu_page + i);
+		phys_mem += PAGE_SIZE;
+	}
+	return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
+}
+
+static dma_addr_t gart_map_simple(struct device *dev, char *buf,
+				 size_t size, int dir)
+{
+	dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
+	flush_gart();
+	return map;
+}
+
+/* Map a single area into the IOMMU */
+dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
+{
+	unsigned long phys_mem, bus;
+
+	BUG_ON(dir == DMA_NONE);
+
+	if (!dev)
+		dev = &fallback_dev;
+
+	phys_mem = virt_to_bus(addr); 
+	if (!need_iommu(dev, phys_mem, size))
+		return phys_mem; 
+
+	bus = gart_map_simple(dev, addr, size, dir);
+	return bus; 
+}
+
+/*
+ * Free a DMA mapping.
+ */
+void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
+		      size_t size, int direction)
+{
+	unsigned long iommu_page;
+	int npages;
+	int i;
+
+	if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
+	    dma_addr >= iommu_bus_base + iommu_size)
+		return;
+	iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
+	npages = to_pages(dma_addr, size);
+	for (i = 0; i < npages; i++) {
+		iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
+		CLEAR_LEAK(iommu_page + i);
+	}
+	free_iommu(iommu_page, npages);
+}
+
+/*
+ * Wrapper for pci_unmap_single working with scatterlists.
+ */
+void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+{
+	int i;
+
+	for (i = 0; i < nents; i++) {
+		struct scatterlist *s = &sg[i];
+		if (!s->dma_length || !s->length)
+			break;
+		gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
+	}
+}
+
+/* Fallback for dma_map_sg in case of overflow */
+static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+			       int nents, int dir)
+{
+	int i;
+
+#ifdef CONFIG_IOMMU_DEBUG
+	printk(KERN_DEBUG "dma_map_sg overflow\n");
+#endif
+
+ 	for (i = 0; i < nents; i++ ) {
+		struct scatterlist *s = &sg[i];
+		unsigned long addr = page_to_phys(s->page) + s->offset; 
+		if (nonforced_iommu(dev, addr, s->length)) { 
+			addr = dma_map_area(dev, addr, s->length, dir);
+			if (addr == bad_dma_address) { 
+				if (i > 0) 
+					gart_unmap_sg(dev, sg, i, dir);
+				nents = 0; 
+				sg[0].dma_length = 0;
+				break;
+			}
+		}
+		s->dma_address = addr;
+		s->dma_length = s->length;
+	}
+	flush_gart();
+	return nents;
+}
+
+/* Map multiple scatterlist entries continuous into the first. */
+static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
+		      struct scatterlist *sout, unsigned long pages)
+{
+	unsigned long iommu_start = alloc_iommu(pages);
+	unsigned long iommu_page = iommu_start; 
+	int i;
+
+	if (iommu_start == -1)
+		return -1;
+	
+	for (i = start; i < stopat; i++) {
+		struct scatterlist *s = &sg[i];
+		unsigned long pages, addr;
+		unsigned long phys_addr = s->dma_address;
+		
+		BUG_ON(i > start && s->offset);
+		if (i == start) {
+			*sout = *s; 
+			sout->dma_address = iommu_bus_base;
+			sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
+			sout->dma_length = s->length;
+		} else { 
+			sout->dma_length += s->length; 
+		}
+
+		addr = phys_addr;
+		pages = to_pages(s->offset, s->length); 
+		while (pages--) { 
+			iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 
+			SET_LEAK(iommu_page);
+			addr += PAGE_SIZE;
+			iommu_page++;
+		}
+	} 
+	BUG_ON(iommu_page - iommu_start != pages);	
+	return 0;
+}
+
+static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
+		      struct scatterlist *sout,
+		      unsigned long pages, int need)
+{
+	if (!need) { 
+		BUG_ON(stopat - start != 1);
+		*sout = sg[start]; 
+		sout->dma_length = sg[start].length; 
+		return 0;
+	} 
+	return __dma_map_cont(sg, start, stopat, sout, pages);
+}
+		
+/*
+ * DMA map all entries in a scatterlist.
+ * Merge chunks that have page aligned sizes into a continuous mapping. 
+ */
+int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+{
+	int i;
+	int out;
+	int start;
+	unsigned long pages = 0;
+	int need = 0, nextneed;
+
+	BUG_ON(dir == DMA_NONE);
+	if (nents == 0) 
+		return 0;
+
+	if (!dev)
+		dev = &fallback_dev;
+
+	out = 0;
+	start = 0;
+	for (i = 0; i < nents; i++) {
+		struct scatterlist *s = &sg[i];
+		dma_addr_t addr = page_to_phys(s->page) + s->offset;
+		s->dma_address = addr;
+		BUG_ON(s->length == 0); 
+
+		nextneed = need_iommu(dev, addr, s->length); 
+
+		/* Handle the previous not yet processed entries */
+		if (i > start) {
+			struct scatterlist *ps = &sg[i-1];
+			/* Can only merge when the last chunk ends on a page 
+			   boundary and the new one doesn't have an offset. */
+			if (!iommu_merge || !nextneed || !need || s->offset ||
+			    (ps->offset + ps->length) % PAGE_SIZE) { 
+				if (dma_map_cont(sg, start, i, sg+out, pages,
+						 need) < 0)
+					goto error;
+				out++;
+				pages = 0;
+				start = i;	
+			}
+		}
+
+		need = nextneed;
+		pages += to_pages(s->offset, s->length);
+	}
+	if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
+		goto error;
+	out++;
+	flush_gart();
+	if (out < nents) 
+		sg[out].dma_length = 0; 
+	return out;
+
+error:
+	flush_gart();
+	gart_unmap_sg(dev, sg, nents, dir);
+	/* When it was forced or merged try again in a dumb way */
+	if (force_iommu || iommu_merge) {
+		out = dma_map_sg_nonforce(dev, sg, nents, dir);
+		if (out > 0)
+			return out;
+	}
+	if (panic_on_overflow)
+		panic("dma_map_sg: overflow on %lu pages\n", pages);
+	iommu_full(dev, pages << PAGE_SHIFT, dir);
+	for (i = 0; i < nents; i++)
+		sg[i].dma_address = bad_dma_address;
+	return 0;
+} 
+
+static int no_agp;
+
+static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
+{ 
+	unsigned long a; 
+	if (!iommu_size) { 
+		iommu_size = aper_size; 
+		if (!no_agp) 
+			iommu_size /= 2; 
+	} 
+
+	a = aper + iommu_size; 
+	iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
+
+	if (iommu_size < 64*1024*1024) 
+		printk(KERN_WARNING
+  "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); 
+	
+	return iommu_size;
+} 
+
+static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) 
+{ 
+	unsigned aper_size = 0, aper_base_32;
+	u64 aper_base;
+	unsigned aper_order;
+
+	pci_read_config_dword(dev, 0x94, &aper_base_32); 
+	pci_read_config_dword(dev, 0x90, &aper_order);
+	aper_order = (aper_order >> 1) & 7;	
+
+	aper_base = aper_base_32 & 0x7fff; 
+	aper_base <<= 25;
+
+	aper_size = (32 * 1024 * 1024) << aper_order; 
+	if (aper_base + aper_size >= 0xffffffff || !aper_size)
+		aper_base = 0;
+
+	*size = aper_size;
+	return aper_base;
+} 
+
+/* 
+ * Private Northbridge GATT initialization in case we cannot use the
+ * AGP driver for some reason.  
+ */
+static __init int init_k8_gatt(struct agp_kern_info *info)
+{ 
+	struct pci_dev *dev;
+	void *gatt;
+	unsigned aper_base, new_aper_base;
+	unsigned aper_size, gatt_size, new_aper_size;
+	int i;
+
+	printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
+	aper_size = aper_base = info->aper_size = 0;
+	dev = NULL;
+	for (i = 0; i < num_k8_northbridges; i++) {
+		dev = k8_northbridges[i];
+		new_aper_base = read_aperture(dev, &new_aper_size); 
+		if (!new_aper_base) 
+			goto nommu; 
+		
+		if (!aper_base) { 
+			aper_size = new_aper_size;
+			aper_base = new_aper_base;
+		} 
+		if (aper_size != new_aper_size || aper_base != new_aper_base) 
+			goto nommu;
+	}
+	if (!aper_base)
+		goto nommu; 
+	info->aper_base = aper_base;
+	info->aper_size = aper_size>>20; 
+
+	gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 
+	gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 
+	if (!gatt) 
+		panic("Cannot allocate GATT table"); 
+#ifdef CONFIG_XEN
+	if (xen_create_contiguous_region((unsigned long)gatt, get_order(gatt_size), 31))
+		panic("Cannot create a contiguous memory region below 4GB for the GATT table");
+#endif
+	memset(gatt, 0, gatt_size); 
+	agp_gatt_table = gatt;
+
+	for (i = 0; i < num_k8_northbridges; i++) {
+		u32 ctl; 
+		u32 gatt_reg; 
+
+		dev = k8_northbridges[i];
+		gatt_reg = phys_to_machine(__pa(gatt)) >> 12;
+		gatt_reg <<= 4; 
+		pci_write_config_dword(dev, 0x98, gatt_reg);
+		pci_read_config_dword(dev, 0x90, &ctl); 
+
+		ctl |= 1;
+		ctl &= ~((1<<4) | (1<<5));
+
+		pci_write_config_dword(dev, 0x90, ctl); 
+	}
+	flush_gart();
+	
+	printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); 
+	return 0;
+
+ nommu:
+ 	/* Should not happen anymore */
+	printk(KERN_ERR "PCI-DMA: init k8_gat More than 4GB of RAM and no IOMMU\n"
+	       KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
+	return -1; 
+} 
+
+extern int agp_amd64_init(void);
+
+static struct dma_mapping_ops gart_dma_ops = {
+	.mapping_error = NULL,
+	.map_single = gart_map_single,
+	.map_simple = gart_map_simple,
+	.unmap_single = gart_unmap_single,
+	.sync_single_for_cpu = NULL,
+	.sync_single_for_device = NULL,
+	.sync_single_range_for_cpu = NULL,
+	.sync_single_range_for_device = NULL,
+	.sync_sg_for_cpu = NULL,
+	.sync_sg_for_device = NULL,
+	.map_sg = gart_map_sg,
+	.unmap_sg = gart_unmap_sg,
+};
+
+void __init gart_iommu_init(void)
+{ 
+	struct agp_kern_info info;
+	unsigned long aper_size;
+	unsigned long iommu_start;
+	unsigned long scratch;
+	long i;
+
+	if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
+		printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
+		return;
+	}
+
+#ifndef CONFIG_AGP_AMD64
+	no_agp = 1; 
+#else
+	/* Makefile puts PCI initialization via subsys_initcall first. */
+	/* Add other K8 AGP bridge drivers here */
+	no_agp = no_agp || 
+		(agp_amd64_init() < 0) || 
+		(agp_copy_info(agp_bridge, &info) < 0);
+#endif	
+
+	if (swiotlb)
+		return;
+
+	/* Did we detect a different HW IOMMU? */
+	if (iommu_detected && !iommu_aperture)
+		return;
+
+	if (no_iommu ||
+	    (!force_iommu && !global_need_iommu()) ||
+	    !iommu_aperture ||
+	    (no_agp && init_k8_gatt(&info) < 0)) {
+		printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
+		if (global_need_iommu()) {
+			printk(KERN_ERR "WARNING more than 4GB of memory "
+					"but IOMMU not available.\n"
+			       KERN_ERR "WARNING 32bit PCI may malfunction.\n");
+		}
+		return;
+	}
+
+	printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
+	aper_size = info.aper_size * 1024 * 1024;	
+	iommu_size = check_iommu_size(info.aper_base, aper_size); 
+	iommu_pages = iommu_size >> PAGE_SHIFT; 
+
+	iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, 
+						    get_order(iommu_pages/8)); 
+	if (!iommu_gart_bitmap) 
+		panic("Cannot allocate iommu bitmap\n"); 
+	memset(iommu_gart_bitmap, 0, iommu_pages/8);
+
+#ifdef CONFIG_IOMMU_LEAK
+	if (leak_trace) { 
+		iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 
+				  get_order(iommu_pages*sizeof(void *)));
+		if (iommu_leak_tab) 
+			memset(iommu_leak_tab, 0, iommu_pages * 8); 
+		else
+			printk("PCI-DMA: Cannot allocate leak trace area\n"); 
+	} 
+#endif
+
+	/* 
+	 * Out of IOMMU space handling.
+	 * Reserve some invalid pages at the beginning of the GART. 
+	 */ 
+	set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 
+
+	agp_memory_reserved = iommu_size;	
+	printk(KERN_INFO
+	       "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+	       iommu_size>>20); 
+
+	iommu_start = aper_size - iommu_size;	
+	iommu_bus_base = info.aper_base + iommu_start; 
+	bad_dma_address = iommu_bus_base;
+	iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+	/* 
+	 * Unmap the IOMMU part of the GART. The alias of the page is
+	 * always mapped with cache enabled and there is no full cache
+	 * coherency across the GART remapping. The unmapping avoids
+	 * automatic prefetches from the CPU allocating cache lines in
+	 * there. All CPU accesses are done via the direct mapping to
+	 * the backing memory. The GART address is only used by PCI
+	 * devices. 
+	 */
+#ifndef CONFIG_XEN
+       clear_kernel_mapping((unsigned long)bus_to_virt(iommu_bus_base), iommu_size);
+#else
+       /*
+        * In xen guests and hypervisor the IOMMU region is only mapped in
+        * virtual memory if it has been allocated from RAM.
+        */
+	if (mfn_to_local_pfn(iommu_bus_base>>PAGE_SHIFT)<end_pfn) {
+		struct mmuext_op op = {
+			.cmd = MMUEXT_UNMAP_REGION,
+			.arg1.mfn = iommu_bus_base>>PAGE_SHIFT,
+			.arg2.nr_ents = iommu_size>>PAGE_SHIFT
+		};
+		clear_kernel_mapping((unsigned long)bus_to_virt(iommu_bus_base), iommu_size);
+		BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF));
+	}
+#endif
+
+	/* 
+	 * Try to workaround a bug (thanks to BenH) 
+	 * Set unmapped entries to a scratch page instead of 0. 
+	 * Any prefetches that hit unmapped entries won't get an bus abort
+	 * then.
+	 */
+	scratch = get_zeroed_page(GFP_KERNEL); 
+	if (!scratch) 
+		panic("Cannot allocate iommu scratch page");
+	gart_unmapped_entry = GPTE_ENCODE(virt_to_bus(scratch));
+	for (i = EMERGENCY_PAGES; i < iommu_pages; i++) 
+		iommu_gatt_base[i] = gart_unmapped_entry;
+
+	flush_gart();
+	iommu_detected = 1;
+	dma_ops = &gart_dma_ops;
+	printk(KERN_INFO "GART: ops set\n");
+} 
+
+void gart_parse_options(char *p)
+{
+	int arg;
+
+#ifdef CONFIG_IOMMU_LEAK
+	if (!strncmp(p,"leak",4)) {
+		leak_trace = 1;
+		p += 4;
+		if (*p == '=') ++p;
+		if (isdigit(*p) && get_option(&p, &arg))
+			iommu_leak_pages = arg;
+	}
+#endif
+	if (isdigit(*p) && get_option(&p, &arg))
+		iommu_size = arg;
+	if (!strncmp(p, "fullflush",8))
+		iommu_fullflush = 1;
+	if (!strncmp(p, "nofullflush",11))
+		iommu_fullflush = 0;
+	if (!strncmp(p,"noagp",5))
+		no_agp = 1;
+	if (!strncmp(p, "noaperture",10))
+		fix_aperture = 0;
+	/* duplicated from pci-dma.c */
+	if (!strncmp(p,"force",5))
+		iommu_aperture_allowed = 1;
+	if (!strncmp(p,"allowed",7))
+		iommu_aperture_allowed = 1;
+	if (!strncmp(p, "memaper", 7)) {
+		fallback_aper_force = 1;
+		p += 7;
+		if (*p == '=') {
+			++p;
+			if (get_option(&p, &arg))
+				fallback_aper_order = arg;
+		}
+	}
+}
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/lib/swiotlb-xen.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/lib/swiotlb-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -0,0 +1,831 @@
+/*
+ * Dynamic DMA mapping support.
+ *
+ * This implementation is for IA-64 and EM64T platforms that do not support
+ * I/O TLBs (aka DMA address translation hardware).
+ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
+ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
+ * Copyright (C) 2000, 2003 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 03/05/07 davidm	Switch from PCI-DMA to generic device DMA API.
+ * 00/12/13 davidm	Rename to swiotlb.c and add mark_clean() to avoid
+ *			unnecessary i-cache flushing.
+ * 04/07/.. ak		Better overflow handling. Assorted fixes.
+ * 05/09/10 linville	Add support for syncing ranges, support syncing for
+ *			DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
+ */
+
+#include <linux/cache.h>
+#include <linux/dma-mapping.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+
+#include <asm/io.h>
+#include <asm/dma.h>
+#include <asm/scatterlist.h>
+
+#include <linux/init.h>
+#include <linux/bootmem.h>
+
+#define OFFSET(val,align) ((unsigned long)	\
+	                   ( (val) & ( (align) - 1)))
+
+#define SG_ENT_VIRT_ADDRESS(sg)	(page_address((sg)->page) + (sg)->offset)
+#define SG_ENT_PHYS_ADDRESS(SG)	virt_to_bus(SG_ENT_VIRT_ADDRESS(SG))
+
+/*
+ * Maximum allowable number of contiguous slabs to map,
+ * must be a power of 2.  What is the appropriate value ?
+ * The complexity of {map,unmap}_single is linearly dependent on this value.
+ */
+#define IO_TLB_SEGSIZE	128
+
+/*
+ * log of the size of each IO TLB slab.  The number of slabs is command line
+ * controllable.
+ */
+#define IO_TLB_SHIFT 11
+
+#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
+
+/*
+ * Minimum IO TLB size to bother booting with.  Systems with mainly
+ * 64bit capable cards will only lightly use the swiotlb.  If we can't
+ * allocate a contiguous 1MB, we're probably in trouble anyway.
+ */
+#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
+
+/*
+ * Enumeration for sync targets
+ */
+enum dma_sync_target {
+	SYNC_FOR_CPU = 0,
+	SYNC_FOR_DEVICE = 1,
+};
+
+int swiotlb_force;
+
+/*
+ * Used to do a quick range check in swiotlb_unmap_single and
+ * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
+ * API.
+ */
+static char *io_tlb_start, *io_tlb_end;
+
+/*
+ * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and
+ * io_tlb_end.  This is command line adjustable via setup_io_tlb_npages.
+ */
+static unsigned long io_tlb_nslabs;
+
+/*
+ * When the IOMMU overflows we return a fallback buffer. This sets the size.
+ */
+static unsigned long io_tlb_overflow = 32*1024;
+
+void *io_tlb_overflow_buffer;
+
+/*
+ * This is a free list describing the number of free entries available from
+ * each index
+ */
+static unsigned int *io_tlb_list;
+static unsigned int io_tlb_index;
+
+/*
+ * We need to save away the original address corresponding to a mapped entry
+ * for the sync operations.
+ */
+static unsigned char **io_tlb_orig_addr;
+
+/*
+ * Protect the above data structures in the map and unmap calls
+ */
+static DEFINE_SPINLOCK(io_tlb_lock);
+
+static int __init
+setup_io_tlb_npages(char *str)
+{
+	if (isdigit(*str)) {
+		io_tlb_nslabs = simple_strtoul(str, &str, 0);
+		/* avoid tail segment of size < IO_TLB_SEGSIZE */
+		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+	}
+	if (*str == ',')
+		++str;
+	if (!strcmp(str, "force"))
+		swiotlb_force = 1;
+	return 1;
+}
+__setup("swiotlb=", setup_io_tlb_npages);
+/* make io_tlb_overflow tunable too? */
+
+/*
+ * Statically reserve bounce buffer space and initialize bounce buffer data
+ * structures for the software IO TLB used to implement the DMA API.
+ */
+void
+swiotlb_init_with_default_size (size_t default_size)
+{
+	unsigned long i;
+
+	if (!io_tlb_nslabs) {
+		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+	}
+
+	/*
+	 * Get IO TLB memory from the low pages
+	 */
+	io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+	if (!io_tlb_start)
+		panic("Cannot allocate SWIOTLB buffer");
+#ifdef CONFIG_XEN
+	if (xen_create_contiguous_region((unsigned long)io_tlb_start, get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT)), 31))
+		panic("Cannot create a contiguous memory region below 4GB for the SWIOTLB buffer");
+#endif
+//TODO ulim: make contiguous
+	io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
+
+	/*
+	 * Allocate and initialize the free list array.  This array is used
+	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+	 * between io_tlb_start and io_tlb_end.
+	 */
+	io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
+	for (i = 0; i < io_tlb_nslabs; i++)
+ 		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+	io_tlb_index = 0;
+	io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(char *));
+
+	/*
+	 * Get the overflow emergency buffer
+	 */
+	io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+#ifdef CONFIG_XEN
+	if (io_tlb_overflow_buffer)
+		xen_create_contiguous_region((unsigned long)io_tlb_overflow_buffer, get_order(io_tlb_overflow), 31);
+#endif
+//TODO ulim: make contiguous
+	printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n",
+	       virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end));
+}
+
+void
+swiotlb_init (void)
+{
+	swiotlb_init_with_default_size(64 * (1<<20));	/* default to 64MB */
+}
+
+/*
+ * Systems with larger DMA zones (those that don't support ISA) can
+ * initialize the swiotlb later using the slab allocator if needed.
+ * This should be just like above, but with some error catching.
+ */
+int
+swiotlb_late_init_with_default_size (size_t default_size)
+{
+	unsigned long i, req_nslabs = io_tlb_nslabs;
+	unsigned int order;
+
+	if (!io_tlb_nslabs) {
+		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+	}
+
+	/*
+	 * Get IO TLB memory from the low pages
+	 */
+	order = get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+	io_tlb_nslabs = SLABS_PER_PAGE << order;
+
+	while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+//TODO ulim: make contiguous
+		io_tlb_start = (char *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+		                                        order);
+#ifdef CONFIG_XEN
+	if (io_tlb_start&&!xen_create_contiguous_region((unsigned long)io_tlb_start, order, 31)) {
+		free_pages((unsigned long)io_tlb_start, order);
+		io_tlb_start = NULL;
+	}
+#endif
+		if (io_tlb_start)
+			break;
+		order--;
+	}
+
+	if (!io_tlb_start)
+		goto cleanup1;
+
+	if (order != get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT))) {
+		printk(KERN_WARNING "Warning: only able to allocate %ld MB "
+		       "for software IO TLB\n", (PAGE_SIZE << order) >> 20);
+		io_tlb_nslabs = SLABS_PER_PAGE << order;
+	}
+	io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
+	memset(io_tlb_start, 0, io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+
+	/*
+	 * Allocate and initialize the free list array.  This array is used
+	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+	 * between io_tlb_start and io_tlb_end.
+	 */
+	io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
+	                              get_order(io_tlb_nslabs * sizeof(int)));
+	if (!io_tlb_list)
+		goto cleanup2;
+
+	for (i = 0; i < io_tlb_nslabs; i++)
+ 		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+	io_tlb_index = 0;
+
+	io_tlb_orig_addr = (unsigned char **)__get_free_pages(GFP_KERNEL,
+	                           get_order(io_tlb_nslabs * sizeof(char *)));
+	if (!io_tlb_orig_addr)
+		goto cleanup3;
+
+	memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(char *));
+
+	/*
+	 * Get the overflow emergency buffer
+	 */
+	io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
+	                                          get_order(io_tlb_overflow));
+#ifdef CONFIG_XEN
+	if (io_tlb_overflow_buffer&&!xen_create_contiguous_region((unsigned long)io_tlb_overflow_buffer, get_order(io_tlb_overflow), 31)) {
+		free_pages((unsigned long)io_tlb_overflow_buffer, order);
+		io_tlb_overflow_buffer = NULL;
+	}
+#endif
+	if (!io_tlb_overflow_buffer)
+		goto cleanup4;
+
+//TODO ulim: make contiguous
+	printk(KERN_INFO "Placing %ldMB software IO TLB between 0x%lx - "
+	       "0x%lx\n", (io_tlb_nslabs * (1 << IO_TLB_SHIFT)) >> 20,
+	       virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end));
+
+	return 0;
+
+cleanup4:
+	free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs *
+	                                                      sizeof(char *)));
+	io_tlb_orig_addr = NULL;
+cleanup3:
+	free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
+	                                                 sizeof(int)));
+	io_tlb_list = NULL;
+	io_tlb_end = NULL;
+cleanup2:
+#ifdef CONFIG_XEN
+	xen_destroy_contiguous_region((unsigned long)io_tlb_start, order);
+#endif
+	free_pages((unsigned long)io_tlb_start, order);
+//TODO ulim: make contiguous
+	io_tlb_start = NULL;
+cleanup1:
+	io_tlb_nslabs = req_nslabs;
+	return -ENOMEM;
+}
+
+static inline int
+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
+{
+	dma_addr_t mask = 0xffffffff;
+	/* If the device has a mask, use it, otherwise default to 32 bits */
+	if (hwdev && hwdev->dma_mask)
+		mask = *hwdev->dma_mask;
+	return (addr & ~mask) != 0;
+}
+
+/*
+ * Allocates bounce buffer and returns its kernel virtual address.
+ */
+static void *
+map_single(struct device *hwdev, char *buffer, size_t size, int dir)
+{
+	unsigned long flags;
+	char *dma_addr;
+	unsigned int nslots, stride, index, wrap;
+	int i;
+
+	/*
+	 * For mappings greater than a page, we limit the stride (and
+	 * hence alignment) to a page size.
+	 */
+	nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	if (size > PAGE_SIZE)
+		stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
+	else
+		stride = 1;
+
+	BUG_ON(!nslots);
+
+	/*
+	 * Find suitable number of IO TLB entries size that will fit this
+	 * request and allocate a buffer from that IO TLB pool.
+	 */
+	spin_lock_irqsave(&io_tlb_lock, flags);
+	{
+		wrap = index = ALIGN(io_tlb_index, stride);
+
+		if (index >= io_tlb_nslabs)
+			wrap = index = 0;
+
+		do {
+			/*
+			 * If we find a slot that indicates we have 'nslots'
+			 * number of contiguous buffers, we allocate the
+			 * buffers from that slot and mark the entries as '0'
+			 * indicating unavailable.
+			 */
+			if (io_tlb_list[index] >= nslots) {
+				int count = 0;
+
+				for (i = index; i < (int) (index + nslots); i++)
+					io_tlb_list[i] = 0;
+				for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
+					io_tlb_list[i] = ++count;
+				dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
+
+				/*
+				 * Update the indices to avoid searching in
+				 * the next round.
+				 */
+				io_tlb_index = ((index + nslots) < io_tlb_nslabs
+						? (index + nslots) : 0);
+
+				goto found;
+			}
+			index += stride;
+			if (index >= io_tlb_nslabs)
+				index = 0;
+		} while (index != wrap);
+
+		spin_unlock_irqrestore(&io_tlb_lock, flags);
+		return NULL;
+	}
+  found:
+	spin_unlock_irqrestore(&io_tlb_lock, flags);
+
+	/*
+	 * Save away the mapping from the original address to the DMA address.
+	 * This is needed when we sync the memory.  Then we sync the buffer if
+	 * needed.
+	 */
+	io_tlb_orig_addr[index] = buffer;
+	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+		memcpy(dma_addr, buffer, size);
+
+	return dma_addr;
+}
+
+/*
+ * dma_addr is the kernel virtual address of the bounce buffer to unmap.
+ */
+static void
+unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+{
+	unsigned long flags;
+	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+	char *buffer = io_tlb_orig_addr[index];
+
+	/*
+	 * First, sync the memory before unmapping the entry
+	 */
+	if (buffer && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
+		/*
+		 * bounce... copy the data back into the original buffer * and
+		 * delete the bounce buffer.
+		 */
+		memcpy(buffer, dma_addr, size);
+
+	/*
+	 * Return the buffer to the free list by setting the corresponding
+	 * entries to indicate the number of contigous entries available.
+	 * While returning the entries to the free list, we merge the entries
+	 * with slots below and above the pool being returned.
+	 */
+	spin_lock_irqsave(&io_tlb_lock, flags);
+	{
+		count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
+			 io_tlb_list[index + nslots] : 0);
+		/*
+		 * Step 1: return the slots to the free list, merging the
+		 * slots with superceeding slots
+		 */
+		for (i = index + nslots - 1; i >= index; i--)
+			io_tlb_list[i] = ++count;
+		/*
+		 * Step 2: merge the returned slots with the preceding slots,
+		 * if available (non zero)
+		 */
+		for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
+			io_tlb_list[i] = ++count;
+	}
+	spin_unlock_irqrestore(&io_tlb_lock, flags);
+}
+
+static void
+sync_single(struct device *hwdev, char *dma_addr, size_t size,
+	    int dir, int target)
+{
+	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+	char *buffer = io_tlb_orig_addr[index];
+
+	switch (target) {
+	case SYNC_FOR_CPU:
+		if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+			memcpy(buffer, dma_addr, size);
+		else
+			BUG_ON(dir != DMA_TO_DEVICE);
+		break;
+	case SYNC_FOR_DEVICE:
+		if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
+			memcpy(dma_addr, buffer, size);
+		else
+			BUG_ON(dir != DMA_FROM_DEVICE);
+		break;
+	default:
+		BUG();
+	}
+}
+
+void *
+swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+		       dma_addr_t *dma_handle, gfp_t flags)
+{
+	unsigned long dev_addr;
+	void *ret;
+	int order = get_order(size);
+
+	/*
+	 * XXX fix me: the DMA API should pass us an explicit DMA mask
+	 * instead, or use ZONE_DMA32 (ia64 overloads ZONE_DMA to be a ~32
+	 * bit range instead of a 16MB one).
+	 */
+	flags |= GFP_DMA;
+
+	ret = (void *)__get_free_pages(flags, order);
+	if (ret && address_needs_mapping(hwdev, virt_to_bus(ret))) {
+		/*
+		 * The allocated memory isn't reachable by the device.
+		 * Fall back on swiotlb_map_single().
+		 */
+		free_pages((unsigned long) ret, order);
+		ret = NULL;
+	}
+	if (!ret) {
+		/*
+		 * We are either out of memory or the device can't DMA
+		 * to GFP_DMA memory; fall back on
+		 * swiotlb_map_single(), which will grab memory from
+		 * the lowest available address range.
+		 */
+		dma_addr_t handle;
+		handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE);
+		if (swiotlb_dma_mapping_error(handle))
+			return NULL;
+
+		ret = bus_to_virt(handle);
+	}
+
+	memset(ret, 0, size);
+	dev_addr = virt_to_bus(ret);
+
+	/* Confirm address can be DMA'd by device */
+	if (address_needs_mapping(hwdev, dev_addr)) {
+		printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016lx\n",
+		       (unsigned long long)*hwdev->dma_mask, dev_addr);
+		panic("swiotlb_alloc_coherent: allocated memory is out of "
+		      "range for device");
+	}
+	*dma_handle = dev_addr;
+	return ret;
+}
+
+void
+swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+		      dma_addr_t dma_handle)
+{
+	if (!(vaddr >= (void *)io_tlb_start
+                    && vaddr < (void *)io_tlb_end))
+		free_pages((unsigned long) vaddr, get_order(size));
+	else
+		/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
+		swiotlb_unmap_single (hwdev, dma_handle, size, DMA_TO_DEVICE);
+}
+
+static void
+swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
+{
+	/*
+	 * Ran out of IOMMU space for this operation. This is very bad.
+	 * Unfortunately the drivers cannot handle this operation properly.
+	 * unless they check for dma_mapping_error (most don't)
+	 * When the mapping is small enough return a static buffer to limit
+	 * the damage, or panic when the transfer is too big.
+	 */
+	printk(KERN_ERR "DMA: Out of SW-IOMMU space for %lu bytes at "
+	       "device %s\n", size, dev ? dev->bus_id : "?");
+
+	if (size > io_tlb_overflow && do_panic) {
+		if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+			panic("DMA: Memory would be corrupted\n");
+		if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+			panic("DMA: Random memory would be DMAed\n");
+	}
+}
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode.  The
+ * physical address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
+ */
+dma_addr_t
+swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
+{
+	unsigned long dev_addr = virt_to_bus(ptr);
+	void *map;
+
+	BUG_ON(dir == DMA_NONE);
+	/*
+	 * If the pointer passed in happens to be in the device's DMA window,
+	 * we can safely return the device addr and not worry about bounce
+	 * buffering it.
+	 */
+	if (!address_needs_mapping(hwdev, dev_addr) && !swiotlb_force)
+		return dev_addr;
+
+	/*
+	 * Oh well, have to allocate and map a bounce buffer.
+	 */
+	map = map_single(hwdev, ptr, size, dir);
+	if (!map) {
+		swiotlb_full(hwdev, size, dir, 1);
+		map = io_tlb_overflow_buffer;
+	}
+
+	dev_addr = virt_to_bus(map);
+
+	/*
+	 * Ensure that the address returned is DMA'ble
+	 */
+	if (address_needs_mapping(hwdev, dev_addr))
+		panic("map_single: bounce buffer is not DMA'ble");
+
+	return dev_addr;
+}
+
+/*
+ * Since DMA is i-cache coherent, any (complete) pages that were written via
+ * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
+ * flush them when they get mapped into an executable vm-area.
+ */
+static void
+mark_clean(void *addr, size_t size)
+{
+	unsigned long pg_addr, end;
+
+	pg_addr = PAGE_ALIGN((unsigned long) addr);
+	end = (unsigned long) addr + size;
+	while (pg_addr + PAGE_SIZE <= end) {
+		struct page *page = virt_to_page(pg_addr);
+		set_bit(PG_arch_1, &page->flags);
+		pg_addr += PAGE_SIZE;
+	}
+}
+
+/*
+ * Unmap a single streaming mode DMA translation.  The dma_addr and size must
+ * match what was provided for in a previous swiotlb_map_single call.  All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+void
+swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
+		     int dir)
+{
+	char *dma_addr = bus_to_virt(dev_addr);
+
+	BUG_ON(dir == DMA_NONE);
+	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
+		unmap_single(hwdev, dma_addr, size, dir);
+	else if (dir == DMA_FROM_DEVICE)
+		mark_clean(dma_addr, size);
+}
+
+/*
+ * Make physical memory consistent for a single streaming mode DMA translation
+ * after a transfer.
+ *
+ * If you perform a swiotlb_map_single() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to teardown the dma mapping, you must
+ * call this function before doing so.  At the next point you give the dma
+ * address back to the card, you must first perform a
+ * swiotlb_dma_sync_for_device, and then the device again owns the buffer
+ */
+static inline void
+swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+		    size_t size, int dir, int target)
+{
+	char *dma_addr = bus_to_virt(dev_addr);
+
+	BUG_ON(dir == DMA_NONE);
+	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
+		sync_single(hwdev, dma_addr, size, dir, target);
+	else if (dir == DMA_FROM_DEVICE)
+		mark_clean(dma_addr, size);
+}
+
+void
+swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+			    size_t size, int dir)
+{
+	swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
+}
+
+void
+swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+			       size_t size, int dir)
+{
+	swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
+}
+
+/*
+ * Same as above, but for a sub-range of the mapping.
+ */
+static inline void
+swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
+			  unsigned long offset, size_t size,
+			  int dir, int target)
+{
+	char *dma_addr = bus_to_virt(dev_addr) + offset;
+
+	BUG_ON(dir == DMA_NONE);
+	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
+		sync_single(hwdev, dma_addr, size, dir, target);
+	else if (dir == DMA_FROM_DEVICE)
+		mark_clean(dma_addr, size);
+}
+
+void
+swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+				  unsigned long offset, size_t size, int dir)
+{
+	swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
+				  SYNC_FOR_CPU);
+}
+
+void
+swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
+				     unsigned long offset, size_t size, int dir)
+{
+	swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
+				  SYNC_FOR_DEVICE);
+}
+
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the above swiotlb_map_single
+ * interface.  Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length.  They are obtained via
+ * sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ *       DMA address/length pairs than there are SG table elements.
+ *       (for example via virtual mapping capabilities)
+ *       The routine returns the number of addr/length pairs actually
+ *       used, at most nents.
+ *
+ * Device ownership issues as mentioned above for swiotlb_map_single are the
+ * same here.
+ */
+int
+swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
+	       int dir)
+{
+	void *addr;
+	unsigned long dev_addr;
+	int i;
+
+	BUG_ON(dir == DMA_NONE);
+
+	for (i = 0; i < nelems; i++, sg++) {
+		addr = SG_ENT_VIRT_ADDRESS(sg);
+		dev_addr = virt_to_bus(addr);
+		if (swiotlb_force || address_needs_mapping(hwdev, dev_addr)) {
+			void *map = map_single(hwdev, addr, sg->length, dir);
+			sg->dma_address = virt_to_bus(map);
+			if (!map) {
+				/* Don't panic here, we expect map_sg users
+				   to do proper error handling. */
+				swiotlb_full(hwdev, sg->length, dir, 0);
+				swiotlb_unmap_sg(hwdev, sg - i, i, dir);
+				sg[0].dma_length = 0;
+				return 0;
+			}
+		} else
+			sg->dma_address = dev_addr;
+		sg->dma_length = sg->length;
+	}
+	return nelems;
+}
+
+/*
+ * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
+ * concerning calls here are the same as for swiotlb_unmap_single() above.
+ */
+void
+swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
+		 int dir)
+{
+	int i;
+
+	BUG_ON(dir == DMA_NONE);
+
+	for (i = 0; i < nelems; i++, sg++)
+		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
+			unmap_single(hwdev, (void *) bus_to_virt(sg->dma_address), sg->dma_length, dir);
+		else if (dir == DMA_FROM_DEVICE)
+			mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length);
+}
+
+/*
+ * Make physical memory consistent for a set of streaming mode DMA translations
+ * after a transfer.
+ *
+ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
+ * and usage.
+ */
+static inline void
+swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sg,
+		int nelems, int dir, int target)
+{
+	int i;
+
+	BUG_ON(dir == DMA_NONE);
+
+	for (i = 0; i < nelems; i++, sg++)
+		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
+			sync_single(hwdev, (void *) sg->dma_address,
+				    sg->dma_length, dir, target);
+}
+
+void
+swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+			int nelems, int dir)
+{
+	swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
+}
+
+void
+swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+			   int nelems, int dir)
+{
+	swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
+}
+
+int
+swiotlb_dma_mapping_error(dma_addr_t dma_addr)
+{
+	return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
+}
+
+/*
+ * Return whether the given device DMA address mask can be supported
+ * properly.  For example, if your device can only drive the low 24-bits
+ * during bus mastering, then you would pass 0x00ffffff as the mask to
+ * this function.
+ */
+int
+swiotlb_dma_supported (struct device *hwdev, u64 mask)
+{
+	return (virt_to_bus (io_tlb_end) - 1) <= mask;
+}
+
+EXPORT_SYMBOL(swiotlb_init);
+EXPORT_SYMBOL(swiotlb_map_single);
+EXPORT_SYMBOL(swiotlb_unmap_single);
+EXPORT_SYMBOL(swiotlb_map_sg);
+EXPORT_SYMBOL(swiotlb_unmap_sg);
+EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
+EXPORT_SYMBOL(swiotlb_sync_single_for_device);
+EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu);
+EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
+EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
+EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
+EXPORT_SYMBOL(swiotlb_dma_mapping_error);
+EXPORT_SYMBOL(swiotlb_alloc_coherent);
+EXPORT_SYMBOL(swiotlb_free_coherent);
+EXPORT_SYMBOL(swiotlb_dma_supported);