# HG changeset patch # User root@iwillnow.amd.com # Node ID 502810b11c7972aabebd8e898e45f63189a493e5 # Parent d39e8c44da34ac257b26cce29122852ef3a930cc Add support for the Opteron AGP GART/aperture as a simplified IOMMU diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c --- a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c Fri Feb 09 10:48:41 2007 +0000 +++ b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c Fri Feb 09 16:32:04 2007 -0600 @@ -252,7 +252,7 @@ static void contiguous_bitmap_clear( } /* Protected by balloon_lock. */ -#define MAX_CONTIG_ORDER 9 /* 2MB */ +#define MAX_CONTIG_ORDER 16 /* 256MB */ static unsigned long discontig_frames[1< is mapped * with type. @@ -116,7 +115,6 @@ e820_any_mapped(unsigned long start, uns } return 0; } -#endif /* * This function checks if the entire range is mapped with type. diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c Fri Feb 09 10:48:41 2007 +0000 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c Fri Feb 09 16:32:04 2007 -0600 @@ -369,7 +369,7 @@ void __init check_ioapic(void) switch (vendor) { case PCI_VENDOR_ID_VIA: #ifdef CONFIG_IOMMU - if ((end_pfn > MAX_DMA32_PFN || + if ((global_need_iommu() || force_iommu) && !iommu_aperture_allowed) { printk(KERN_INFO diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/pci-swiotlb-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-swiotlb-xen.c Fri Feb 09 10:48:41 2007 +0000 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-swiotlb-xen.c Fri Feb 09 16:32:04 2007 -0600 @@ -8,13 +8,10 @@ #include #include -#if 0 int swiotlb __read_mostly; EXPORT_SYMBOL(swiotlb); -#endif struct dma_mapping_ops swiotlb_dma_ops = { -#if 0 .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, @@ -29,14 +26,12 @@ struct dma_mapping_ops swiotlb_dma_ops = .map_sg = swiotlb_map_sg, .unmap_sg = swiotlb_unmap_sg, .dma_supported = NULL, -#endif }; void pci_swiotlb_init(void) { -#if 0 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) + if (!iommu_detected && !no_iommu) swiotlb = 1; if (swiotlb_force) swiotlb = 1; @@ -45,11 +40,4 @@ void pci_swiotlb_init(void) swiotlb_init(); dma_ops = &swiotlb_dma_ops; } -#else - swiotlb_init(); - if (swiotlb) { - printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); - dma_ops = &swiotlb_dma_ops; - } -#endif } diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c Fri Feb 09 10:48:41 2007 +0000 +++ b/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c Fri Feb 09 16:32:04 2007 -0600 @@ -844,6 +844,7 @@ void __init paging_init(void) } #endif +#ifndef CONFIG_XEN /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches from the CPU leading to inconsistent cache lines. address and size must be aligned to 2MB boundaries. @@ -877,6 +878,39 @@ void __init clear_kernel_mapping(unsigne } __flush_tlb_all(); } +#else +/* In Xen the direct mapping doesn't use super pages. */ +void __init clear_kernel_mapping(unsigned long address, unsigned long size) +{ + unsigned long end = address + size; + + BUG_ON(address & ~PAGE_MASK); + BUG_ON(size & ~PAGE_MASK); + + for (; address < end; address += PAGE_SIZE) { + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, address); + if (!pmd || pmd_none(*pmd)) + continue; + pte = pte_offset_map(pmd, address); + if (!pte || pte_none(*pte)) + continue; + + pte_clear(&init_mm,address,pte); + + } + __flush_tlb_all(); +} +#endif /* * Memory hotplug specific functions diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h Fri Feb 09 10:48:41 2007 +0000 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h Fri Feb 09 16:32:04 2007 -0600 @@ -62,7 +62,12 @@ static inline int valid_dma_direction(in (dma_direction == DMA_FROM_DEVICE)); } -#if 0 +#ifdef CONFIG_XEN +#define global_need_iommu() 1 +#else +#define global_need_iommu() (HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL)>MAX_DMA32_PFN) +#endif + static inline int dma_mapping_error(dma_addr_t dma_addr) { if (dma_ops->mapping_error) @@ -200,8 +205,5 @@ dma_cache_sync(void *vaddr, size_t size, extern struct device fallback_dev; extern int panic_on_overflow; -#endif #endif /* _X8664_DMA_MAPPING_H */ - -#include diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/lib/Makefile --- a/linux-2.6-xen-sparse/lib/Makefile Fri Feb 09 10:48:41 2007 +0000 +++ b/linux-2.6-xen-sparse/lib/Makefile Fri Feb 09 16:32:04 2007 -0600 @@ -51,8 +51,7 @@ obj-$(CONFIG_SMP) += percpu_counter.o obj-$(CONFIG_SMP) += percpu_counter.o obj-$(CONFIG_AUDIT_GENERIC) += audit.o -obj-$(CONFIG_SWIOTLB) += swiotlb.o -swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o +obj-$(CONFIG_SWIOTLB) += swiotlb-xen.o hostprogs-y := gen_crc32table clean-files := crc32table.h diff -r d39e8c44da34 -r 502810b11c79 xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Fri Feb 09 10:48:41 2007 +0000 +++ b/xen/arch/x86/mm.c Fri Feb 09 16:32:04 2007 -0600 @@ -2094,6 +2094,12 @@ int do_mmuext_op( } } break; + + case MMUEXT_UNMAP_REGION: + map_pages_to_xen( + PAGE_OFFSET + (op.arg1.mfn << PAGE_SHIFT), + op.arg1.mfn, op.arg2.nr_ents, PAGE_HYPERVISOR & ~_PAGE_PRESENT); + break; #endif case MMUEXT_TLB_FLUSH_LOCAL: diff -r d39e8c44da34 -r 502810b11c79 xen/include/public/xen.h --- a/xen/include/public/xen.h Fri Feb 09 10:48:41 2007 +0000 +++ b/xen/include/public/xen.h Fri Feb 09 16:32:04 2007 -0600 @@ -188,6 +188,10 @@ * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only] * mfn: Machine frame number of new page-table base to install in MMU * when in user space. + * + * cmd: MMUEXT_UNMAP_REGION [x86/64 only] + * mfn: First machine frame number to unmap from the hypervisor's virtual memory. + * nr_ents: Number of mfns to unmap. * * cmd: MMUEXT_TLB_FLUSH_LOCAL * No additional arguments. Flushes local TLB. @@ -230,6 +234,7 @@ #define MMUEXT_FLUSH_CACHE 12 #define MMUEXT_SET_LDT 13 #define MMUEXT_NEW_USER_BASEPTR 15 +#define MMUEXT_UNMAP_REGION 16 #ifndef __ASSEMBLY__ struct mmuext_op { diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/aperture-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/aperture-xen.c Fri Feb 09 16:32:04 2007 -0600 @@ -0,0 +1,284 @@ +/* + * Firmware replacement code. + * + * Work around broken BIOSes that don't set an aperture or only set the + * aperture in the AGP bridge. + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. This is done at early boot + * because only the bootmem allocator can allocate 32+MB. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int iommu_aperture; +int iommu_aperture_disabled __initdata = 0; +int iommu_aperture_allowed __initdata = 0; + +int fallback_aper_order __initdata = 1; /* 64MB */ +int fallback_aper_force __initdata = 0; + +int fix_aperture __initdata = 1; + +/* This code runs before the PCI subsystem is initialized, so just + access the northbridge directly. */ + +static u32 __init allocate_aperture(void) +{ + pg_data_t *nd0 = NODE_DATA(0); + u32 aper_size; + void *p; + + if (fallback_aper_order > 7) + fallback_aper_order = 7; + aper_size = (32 * 1024 * 1024) << fallback_aper_order; + + /* + * Aperture has to be naturally aligned. This means an 2GB aperture won't + * have much chances to find a place in the lower 4GB of memory. + * Unfortunately we cannot move it up because that would make the + * IOMMU useless. + */ + p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); +#ifndef CONFIG_XEN + if (!p || __pa(p)+aper_size > 0xffffffff) { +#else + if (!p || (xen_create_contiguous_region((unsigned long)p, get_order(aper_size), 31) != 0)||virt_to_bus(p)+aper_size > 0xffffffff) { +#endif + printk("Cannot allocate aperture memory hole (%p,%uK)\n", + p, aper_size>>10); + if (p) + free_bootmem_node(nd0, __pa(p), aper_size); + return 0; + } + printk("Mapping aperture over %d KB of RAM @ %lx\n", + aper_size >> 10, virt_to_bus(p)); + return (u32)virt_to_bus(p); +} + +static int __init aperture_valid(u64 aper_base, u32 aper_size) +{ + if (!aper_base) + return 0; + if (aper_size < 64*1024*1024) { + printk("Aperture too small (%d MB)\n", aper_size>>20); + return 0; + } + if (aper_base + aper_size >= 0xffffffff) { + printk("Aperture beyond 4GB. Ignoring.\n"); + return 0; + } + if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + printk("Aperture pointing to e820 RAM. Ignoring.\n"); + return 0; + } + return 1; +} + +/* Find a PCI capability */ +static __u32 __init find_cap(int num, int slot, int func, int cap) +{ + u8 pos; + int bytes; + if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) + return 0; + pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + pos &= ~3; + id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +/* Read a standard AGPv3 bridge header */ +static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) +{ + u32 apsize; + u32 apsizereg; + int nbits; + u32 aper_low, aper_hi; + u64 aper; + + printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); + apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); + if (apsizereg == 0xffffffff) { + printk("APSIZE in AGP bridge unreadable\n"); + return 0; + } + + apsize = apsizereg & 0xfff; + /* Some BIOS use weird encodings not in the AGPv3 table. */ + if (apsize & 0xff) + apsize |= 0xf00; + nbits = hweight16(apsize); + *order = 7 - nbits; + if ((int)*order < 0) /* < 32MB */ + *order = 0; + + aper_low = read_pci_config(num,slot,func, 0x10); + aper_hi = read_pci_config(num,slot,func,0x14); + aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + + printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", + aper, 32 << *order, apsizereg); + + if (!aperture_valid(aper, (32*1024*1024) << *order)) + return 0; + return (u32)aper; +} + +/* Look for an AGP bridge. Windows only expects the aperture in the + AGP bridge and some BIOS forget to initialize the Northbridge too. + Work around this here. + + Do an PCI bus scan by hand because we're running before the PCI + subsystem. + + All K8 AGP bridges are AGPv3 compliant, so we can do this scan + generically. It's probably overkill to always scan all slots because + the AGP bridges should be always an own bus on the HT hierarchy, + but do it here for future safety. */ +static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) +{ + int num, slot, func; + + /* Poor man's PCI discovery */ + for (num = 0; num < 256; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class, cap; + u8 type; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + switch (class >> 16) { + case PCI_CLASS_BRIDGE_HOST: + case PCI_CLASS_BRIDGE_OTHER: /* needed? */ + /* AGP bridge? */ + cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); + if (!cap) + break; + *valid_agp = 1; + return read_agp(num,slot,func,cap,order); + } + + /* No multi-function device? */ + type = read_pci_config_byte(num,slot,func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } + printk("No AGP bridge found\n"); + return 0; +} + +void __init iommu_hole_init(void) +{ + int fix, num; + u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; + u64 aper_base, last_aper_base = 0; + int valid_agp = 0; + + if (iommu_aperture_disabled || !fix_aperture) + return; + + printk("Checking aperture...\n"); + + fix = 0; + for (num = 24; num < 32; num++) { + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + + iommu_detected = 1; + iommu_aperture = 1; + + aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; + aper_base <<= 25; + + printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, + aper_base, aper_size>>20); + + if (!aperture_valid(aper_base, aper_size)) { + fix = 1; + break; + } + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base)) { + fix = 1; + break; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + } + + if (!fix && !fallback_aper_force) + return; + + if (!fallback_aper_force) + aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ + } else if (swiotlb && !valid_agp) { + /* Do nothing */ + } else if ((!no_iommu && global_need_iommu()) || + force_iommu || + valid_agp || + fallback_aper_force) { + printk("Your BIOS doesn't leave a aperture memory hole\n"); + printk("Please enable the IOMMU option in the BIOS setup\n"); + printk("This costs you %d MB of RAM\n", + 32 << fallback_aper_order); + + aper_order = fallback_aper_order; + aper_alloc = allocate_aperture(); + if (!aper_alloc) { + /* Could disable AGP and IOMMU here, but it's probably + not worth it. But the later users cannot deal with + bad apertures and turning on the aperture over memory + causes very strange problems, so it's better to + panic early. */ + panic("Not enough memory for aperture"); + } + } else { + return; + } + + /* Fix up the north bridges */ + for (num = 24; num < 32; num++) { + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + + /* Don't enable translation yet. That is done later. + Assume this BIOS didn't initialise the GART so + just overwrite all previous bits */ + write_pci_config(0, num, 3, 0x90, aper_order<<1); + write_pci_config(0, num, 3, 0x94, aper_alloc>>25); + } +} diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/pci-dma-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-dma-xen.c Fri Feb 09 16:32:04 2007 -0600 @@ -0,0 +1,328 @@ +/* + * Dynamic DMA mapping support. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int iommu_merge __read_mostly = 0; +EXPORT_SYMBOL(iommu_merge); + +dma_addr_t bad_dma_address __read_mostly; +EXPORT_SYMBOL(bad_dma_address); + +/* This tells the BIO block layer to assume merging. Default to off + because we cannot guarantee merging later. */ +int iommu_bio_merge __read_mostly = 0; +EXPORT_SYMBOL(iommu_bio_merge); + +int iommu_sac_force __read_mostly = 0; +EXPORT_SYMBOL(iommu_sac_force); + +int no_iommu __read_mostly; +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow __read_mostly = 1; +int force_iommu __read_mostly = 1; +#else +int panic_on_overflow __read_mostly = 0; +int force_iommu __read_mostly= 0; +#endif + +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + +/* Dummy device used for NULL arguments (normally ISA). Better would + be probably a smaller DMA mask, but this is bug-to-bug compatible + to i386. */ +struct device fallback_dev = { + .bus_id = "fallback device", + .coherent_dma_mask = DMA_32BIT_MASK, + .dma_mask = &fallback_dev.coherent_dma_mask, +}; + +/* Allocate DMA memory on node near device */ +noinline static void * +dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) +{ + struct page *page; + int node; +#ifdef CONFIG_PCI + if (dev->bus == &pci_bus_type) + node = pcibus_to_node(to_pci_dev(dev)->bus); + else +#endif + node = numa_node_id(); + + if (node < first_node(node_online_map)) + node = first_node(node_online_map); + + page = alloc_pages_node(node, gfp, order); + return page ? page_address(page) : NULL; +} + +/* + * Allocate memory for a coherent mapping. + */ +void * +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp) +{ + void *memory; + unsigned long dma_mask = 0; + u64 bus; + + if (!dev) + dev = &fallback_dev; + dma_mask = dev->coherent_dma_mask; + if (dma_mask == 0) + dma_mask = DMA_32BIT_MASK; + + /* Don't invoke OOM killer */ + gfp |= __GFP_NORETRY; + + /* Kludge to make it bug-to-bug compatible with i386. i386 + uses the normal dma_mask for alloc_coherent. */ + dma_mask &= *dev->dma_mask; + + /* Why <=? Even when the mask is smaller than 4GB it is often + larger than 16MB and in this case we have a chance of + finding fitting memory in the next higher zone first. If + not retry with true GFP_DMA. -AK */ + if (dma_mask <= DMA_32BIT_MASK) + gfp |= GFP_DMA32; + + again: + memory = dma_alloc_pages(dev, gfp, get_order(size)); + if (memory == NULL) + return NULL; + + { + int high, mmu; + bus = virt_to_bus(memory); + high = (bus + size) >= dma_mask; + mmu = high; + if (force_iommu && !(gfp & GFP_DMA)) + mmu = 1; + else if (high) { + free_pages((unsigned long)memory, + get_order(size)); + + /* Don't use the 16MB ZONE_DMA unless absolutely + needed. It's better to use remapping first. */ + if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { + gfp = (gfp & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + /* Let low level make its own zone decisions */ + gfp &= ~(GFP_DMA32|GFP_DMA); + + if (dma_ops->alloc_coherent) + return dma_ops->alloc_coherent(dev, size, + dma_handle, gfp); + return NULL; + } + + /* Hardcode 31 address bits for now: aacraid limitation. */ + if (xen_create_contiguous_region((unsigned long)memory, get_order(size), 31) != 0) { + free_pages((unsigned long)memory, get_order(size)); + return NULL; + } + + memset(memory, 0, size); + if (!mmu) { + *dma_handle = virt_to_bus(memory); + return memory; + } + } + + if (dma_ops->alloc_coherent) { + free_pages((unsigned long)memory, get_order(size)); + gfp &= ~(GFP_DMA|GFP_DMA32); + return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); + } + + if (dma_ops->map_simple) { + *dma_handle = dma_ops->map_simple(dev, memory, + size, + PCI_DMA_BIDIRECTIONAL); + if (*dma_handle != bad_dma_address) + return memory; + } + + if (panic_on_overflow) + panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size); + free_pages((unsigned long)memory, get_order(size)); + return NULL; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +/* + * Unmap coherent memory. + * The caller must ensure that the device has finished accessing the mapping. + */ +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus) +{ + if (dma_ops->unmap_single) + dma_ops->unmap_single(dev, bus, size, 0); + xen_destroy_contiguous_region((unsigned long) vaddr, get_order(size)); + free_pages((unsigned long)vaddr, get_order(size)); +} +EXPORT_SYMBOL(dma_free_coherent); + +int dma_supported(struct device *dev, u64 mask) +{ + if (dma_ops->dma_supported) + return dma_ops->dma_supported(dev, mask); + + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_24BIT_MASK) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. This + allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area and + return DAC as fallback address the device may not handle it + correctly. + + As a special case some controllers have a 39bit address + mode that is as efficient as 32bit (aic79xx). Don't force + SAC for these. Assume all masks <= 40 bits are of this + type. Normally this doesn't make any difference, but gives + more gentle handling of IOMMU overflow. */ + if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { + printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(dma_supported); + +int dma_set_mask(struct device *dev, u64 mask) +{ + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + *dev->dma_mask = mask; + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + +/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] + [,forcesac][,fullflush][,nomerge][,biomerge] + size set size of iommu (in bytes) + noagp don't initialize the AGP driver and use full aperture. + off don't use the IOMMU + leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) + memaper[=order] allocate an own aperture over RAM with size 32MB^order. + noforce don't force IOMMU usage. Default. + force Force IOMMU. + merge Do lazy merging. This may improve performance on some block devices. + Implies force (experimental) + biomerge Do merging at the BIO layer. This is more efficient than merge, + but should be only done with very big IOMMUs. Implies merge,force. + nomerge Don't do SG merging. + forcesac For SAC mode for masks <40bits (experimental) + fullflush Flush IOMMU on each allocation (default) + nofullflush Don't use IOMMU fullflush + allowed overwrite iommu off workarounds for specific chipsets. + soft Use software bounce buffering (default for Intel machines) + noaperture Don't touch the aperture for AGP. +*/ +__init int iommu_setup(char *p) +{ + iommu_merge = 1; + + while (*p) { + if (!strncmp(p,"off",3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p,"force",5)) + force_iommu = 1; + if (!strncmp(p,"noforce",7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge",8)) { + iommu_bio_merge = 4096; + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic",5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic",7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge",5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge",7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac",8)) + iommu_sac_force = 1; + +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft",4)) + swiotlb = 1; +#endif + +#ifdef CONFIG_IOMMU + gart_parse_options(p); +#endif + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 1; +} +__setup("iommu=", iommu_setup); + +void __init pci_iommu_alloc(void) +{ + /* + * The order of these functions is important for + * fall-back/fail-over reasons + */ +#ifdef CONFIG_IOMMU + iommu_hole_init(); +#endif + +#ifdef CONFIG_CALGARY_IOMMU + detect_calgary(); +#endif + +#ifdef CONFIG_SWIOTLB + pci_swiotlb_init(); +#endif +} + +static int __init pci_iommu_init(void) +{ +#ifdef CONFIG_CALGARY_IOMMU + calgary_iommu_init(); +#endif + +#ifdef CONFIG_IOMMU + gart_iommu_init(); +#endif + + no_iommu_init(); + return 0; +} + +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/pci-gart-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-gart-xen.c Fri Feb 09 16:32:04 2007 -0600 @@ -0,0 +1,741 @@ +/* + * Dynamic DMA mapping support for AMD Hammer. + * + * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. + * This allows to use PCI devices that only support 32bit addresses on systems + * with more than 4GB. + * + * See Documentation/DMA-mapping.txt for the interface specification. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long iommu_bus_base; /* GART remapping area (physical) */ +static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_pages; /* .. and in pages */ + +u32 *iommu_gatt_base; /* Remapping table */ + +/* If this is disabled the IOMMU will use an optimized flushing strategy + of only flushing when an mapping is reused. With it true the GART is flushed + for every mapping. Problem is that doing the lazy flush seems to trigger + bugs with some popular PCI cards, in particular 3ware (but has been also + also seen with Qlogic at least). */ +int iommu_fullflush = 1; + +/* Allocation bitmap for the remapping area */ +static DEFINE_SPINLOCK(iommu_bitmap_lock); +static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ + +static u32 gart_unmapped_entry; + +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 +#define GPTE_ENCODE(x) \ + (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) +#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) + +#define to_pages(addr,size) \ + (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) + +#define EMERGENCY_PAGES 32 /* = 128KB */ + +#ifdef CONFIG_AGP +#define AGPEXTERN extern +#else +#define AGPEXTERN +#endif + +/* backdoor interface to AGP driver */ +AGPEXTERN int agp_memory_reserved; +AGPEXTERN __u32 *agp_gatt_table; + +static unsigned long next_bit; /* protected by iommu_bitmap_lock */ +static int need_flush; /* global flush state. set for each gart wrap */ + +static unsigned long alloc_iommu(int size) +{ + unsigned long offset, flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); + if (offset == -1) { + need_flush = 1; + offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); + } + if (offset != -1) { + set_bit_string(iommu_gart_bitmap, offset, size); + next_bit = offset+size; + if (next_bit >= iommu_pages) { + next_bit = 0; + need_flush = 1; + } + } + if (iommu_fullflush) + need_flush = 1; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + return offset; +} + +static void free_iommu(unsigned long offset, int size) +{ + unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); + __clear_bit_string(iommu_gart_bitmap, offset, size); + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* + * Use global flush state to avoid races with multiple flushers. + */ +static void flush_gart(void) +{ + unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); + if (need_flush) { + k8_flush_garts(); + need_flush = 0; + } + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +#ifdef CONFIG_IOMMU_LEAK + +#define SET_LEAK(x) if (iommu_leak_tab) \ + iommu_leak_tab[x] = __builtin_return_address(0); +#define CLEAR_LEAK(x) if (iommu_leak_tab) \ + iommu_leak_tab[x] = NULL; + +/* Debugging aid for drivers that don't free their IOMMU tables */ +static void **iommu_leak_tab; +static int leak_trace; +int iommu_leak_pages = 20; +void dump_leak(void) +{ + int i; + static int dump; + if (dump || !iommu_leak_tab) return; + dump = 1; + show_stack(NULL,NULL); + /* Very crude. dump some from the end of the table too */ + printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); + for (i = 0; i < iommu_leak_pages; i+=2) { + printk("%lu: ", iommu_pages-i); + printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); + printk("%c", (i+1)%2 == 0 ? '\n' : ' '); + } + printk("\n"); +} +#else +#define SET_LEAK(x) +#define CLEAR_LEAK(x) +#endif + +static void iommu_full(struct device *dev, size_t size, int dir) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * Return some non mapped prereserved space in the aperture and + * let the Northbridge deal with it. This will result in garbage + * in the IO operation. When the size exceeds the prereserved space + * memory corruption will occur or random memory will be DMAed + * out. Hopefully no network devices use single mappings that big. + */ + + printk(KERN_ERR + "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", + size, dev->bus_id); + + if (size > PAGE_SIZE*EMERGENCY_PAGES) { + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Memory would be corrupted\n"); + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); + } + +#ifdef CONFIG_IOMMU_LEAK + dump_leak(); +#endif +} + +static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) +{ + u64 mask = *dev->dma_mask; + int high = addr + size >= mask; + int mmu = high; + if (force_iommu) + mmu = 1; + return mmu; +} + +static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ + u64 mask = *dev->dma_mask; + int high = addr + size >= mask; + int mmu = high; + return mmu; +} + +/* Map a single continuous physical area into the IOMMU. + * Caller needs to check if the iommu is needed and flush. + */ +static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, + size_t size, int dir) +{ + unsigned long npages = to_pages(phys_mem, size); + unsigned long iommu_page = alloc_iommu(npages); + int i; + if (iommu_page == -1) { + if (!nonforced_iommu(dev, phys_mem, size)) + return phys_mem; + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir); + return bad_dma_address; + } + + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); + SET_LEAK(iommu_page + i); + phys_mem += PAGE_SIZE; + } + return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); +} + +static dma_addr_t gart_map_simple(struct device *dev, char *buf, + size_t size, int dir) +{ + dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); + flush_gart(); + return map; +} + +/* Map a single area into the IOMMU */ +dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) +{ + unsigned long phys_mem, bus; + + BUG_ON(dir == DMA_NONE); + + if (!dev) + dev = &fallback_dev; + + phys_mem = virt_to_bus(addr); + if (!need_iommu(dev, phys_mem, size)) + return phys_mem; + + bus = gart_map_simple(dev, addr, size, dir); + return bus; +} + +/* + * Free a DMA mapping. + */ +void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + unsigned long iommu_page; + int npages; + int i; + + if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || + dma_addr >= iommu_bus_base + iommu_size) + return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = to_pages(dma_addr, size); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + CLEAR_LEAK(iommu_page + i); + } + free_iommu(iommu_page, npages); +} + +/* + * Wrapper for pci_unmap_single working with scatterlists. + */ +void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +{ + int i; + + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + if (!s->dma_length || !s->length) + break; + gart_unmap_single(dev, s->dma_address, s->dma_length, dir); + } +} + +/* Fallback for dma_map_sg in case of overflow */ +static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + int i; + +#ifdef CONFIG_IOMMU_DEBUG + printk(KERN_DEBUG "dma_map_sg overflow\n"); +#endif + + for (i = 0; i < nents; i++ ) { + struct scatterlist *s = &sg[i]; + unsigned long addr = page_to_phys(s->page) + s->offset; + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir); + if (addr == bad_dma_address) { + if (i > 0) + gart_unmap_sg(dev, sg, i, dir); + nents = 0; + sg[0].dma_length = 0; + break; + } + } + s->dma_address = addr; + s->dma_length = s->length; + } + flush_gart(); + return nents; +} + +/* Map multiple scatterlist entries continuous into the first. */ +static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, + struct scatterlist *sout, unsigned long pages) +{ + unsigned long iommu_start = alloc_iommu(pages); + unsigned long iommu_page = iommu_start; + int i; + + if (iommu_start == -1) + return -1; + + for (i = start; i < stopat; i++) { + struct scatterlist *s = &sg[i]; + unsigned long pages, addr; + unsigned long phys_addr = s->dma_address; + + BUG_ON(i > start && s->offset); + if (i == start) { + *sout = *s; + sout->dma_address = iommu_bus_base; + sout->dma_address += iommu_page*PAGE_SIZE + s->offset; + sout->dma_length = s->length; + } else { + sout->dma_length += s->length; + } + + addr = phys_addr; + pages = to_pages(s->offset, s->length); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + SET_LEAK(iommu_page); + addr += PAGE_SIZE; + iommu_page++; + } + } + BUG_ON(iommu_page - iommu_start != pages); + return 0; +} + +static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, + struct scatterlist *sout, + unsigned long pages, int need) +{ + if (!need) { + BUG_ON(stopat - start != 1); + *sout = sg[start]; + sout->dma_length = sg[start].length; + return 0; + } + return __dma_map_cont(sg, start, stopat, sout, pages); +} + +/* + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. + */ +int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +{ + int i; + int out; + int start; + unsigned long pages = 0; + int need = 0, nextneed; + + BUG_ON(dir == DMA_NONE); + if (nents == 0) + return 0; + + if (!dev) + dev = &fallback_dev; + + out = 0; + start = 0; + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + dma_addr_t addr = page_to_phys(s->page) + s->offset; + s->dma_address = addr; + BUG_ON(s->length == 0); + + nextneed = need_iommu(dev, addr, s->length); + + /* Handle the previous not yet processed entries */ + if (i > start) { + struct scatterlist *ps = &sg[i-1]; + /* Can only merge when the last chunk ends on a page + boundary and the new one doesn't have an offset. */ + if (!iommu_merge || !nextneed || !need || s->offset || + (ps->offset + ps->length) % PAGE_SIZE) { + if (dma_map_cont(sg, start, i, sg+out, pages, + need) < 0) + goto error; + out++; + pages = 0; + start = i; + } + } + + need = nextneed; + pages += to_pages(s->offset, s->length); + } + if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) + goto error; + out++; + flush_gart(); + if (out < nents) + sg[out].dma_length = 0; + return out; + +error: + flush_gart(); + gart_unmap_sg(dev, sg, nents, dir); + /* When it was forced or merged try again in a dumb way */ + if (force_iommu || iommu_merge) { + out = dma_map_sg_nonforce(dev, sg, nents, dir); + if (out > 0) + return out; + } + if (panic_on_overflow) + panic("dma_map_sg: overflow on %lu pages\n", pages); + iommu_full(dev, pages << PAGE_SHIFT, dir); + for (i = 0; i < nents; i++) + sg[i].dma_address = bad_dma_address; + return 0; +} + +static int no_agp; + +static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +{ + unsigned long a; + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; + iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; + + if (iommu_size < 64*1024*1024) + printk(KERN_WARNING + "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); + + return iommu_size; +} + +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32; + u64 aper_base; + unsigned aper_order; + + pci_read_config_dword(dev, 0x94, &aper_base_32); + pci_read_config_dword(dev, 0x90, &aper_order); + aper_order = (aper_order >> 1) & 7; + + aper_base = aper_base_32 & 0x7fff; + aper_base <<= 25; + + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size >= 0xffffffff || !aper_size) + aper_base = 0; + + *size = aper_size; + return aper_base; +} + +/* + * Private Northbridge GATT initialization in case we cannot use the + * AGP driver for some reason. + */ +static __init int init_k8_gatt(struct agp_kern_info *info) +{ + struct pci_dev *dev; + void *gatt; + unsigned aper_base, new_aper_base; + unsigned aper_size, gatt_size, new_aper_size; + int i; + + printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); + aper_size = aper_base = info->aper_size = 0; + dev = NULL; + for (i = 0; i < num_k8_northbridges; i++) { + dev = k8_northbridges[i]; + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { + aper_size = new_aper_size; + aper_base = new_aper_base; + } + if (aper_size != new_aper_size || aper_base != new_aper_base) + goto nommu; + } + if (!aper_base) + goto nommu; + info->aper_base = aper_base; + info->aper_size = aper_size>>20; + + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + if (!gatt) + panic("Cannot allocate GATT table"); +#ifdef CONFIG_XEN + if (xen_create_contiguous_region((unsigned long)gatt, get_order(gatt_size), 31)) + panic("Cannot create a contiguous memory region below 4GB for the GATT table"); +#endif + memset(gatt, 0, gatt_size); + agp_gatt_table = gatt; + + for (i = 0; i < num_k8_northbridges; i++) { + u32 ctl; + u32 gatt_reg; + + dev = k8_northbridges[i]; + gatt_reg = phys_to_machine(__pa(gatt)) >> 12; + gatt_reg <<= 4; + pci_write_config_dword(dev, 0x98, gatt_reg); + pci_read_config_dword(dev, 0x90, &ctl); + + ctl |= 1; + ctl &= ~((1<<4) | (1<<5)); + + pci_write_config_dword(dev, 0x90, ctl); + } + flush_gart(); + + printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); + return 0; + + nommu: + /* Should not happen anymore */ + printk(KERN_ERR "PCI-DMA: init k8_gat More than 4GB of RAM and no IOMMU\n" + KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); + return -1; +} + +extern int agp_amd64_init(void); + +static struct dma_mapping_ops gart_dma_ops = { + .mapping_error = NULL, + .map_single = gart_map_single, + .map_simple = gart_map_simple, + .unmap_single = gart_unmap_single, + .sync_single_for_cpu = NULL, + .sync_single_for_device = NULL, + .sync_single_range_for_cpu = NULL, + .sync_single_range_for_device = NULL, + .sync_sg_for_cpu = NULL, + .sync_sg_for_device = NULL, + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, +}; + +void __init gart_iommu_init(void) +{ + struct agp_kern_info info; + unsigned long aper_size; + unsigned long iommu_start; + unsigned long scratch; + long i; + + if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { + printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); + return; + } + +#ifndef CONFIG_AGP_AMD64 + no_agp = 1; +#else + /* Makefile puts PCI initialization via subsys_initcall first. */ + /* Add other K8 AGP bridge drivers here */ + no_agp = no_agp || + (agp_amd64_init() < 0) || + (agp_copy_info(agp_bridge, &info) < 0); +#endif + + if (swiotlb) + return; + + /* Did we detect a different HW IOMMU? */ + if (iommu_detected && !iommu_aperture) + return; + + if (no_iommu || + (!force_iommu && !global_need_iommu()) || + !iommu_aperture || + (no_agp && init_k8_gatt(&info) < 0)) { + printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); + if (global_need_iommu()) { + printk(KERN_ERR "WARNING more than 4GB of memory " + "but IOMMU not available.\n" + KERN_ERR "WARNING 32bit PCI may malfunction.\n"); + } + return; + } + + printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); + aper_size = info.aper_size * 1024 * 1024; + iommu_size = check_iommu_size(info.aper_base, aper_size); + iommu_pages = iommu_size >> PAGE_SHIFT; + + iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); + memset(iommu_gart_bitmap, 0, iommu_pages/8); + +#ifdef CONFIG_IOMMU_LEAK + if (leak_trace) { + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + get_order(iommu_pages*sizeof(void *))); + if (iommu_leak_tab) + memset(iommu_leak_tab, 0, iommu_pages * 8); + else + printk("PCI-DMA: Cannot allocate leak trace area\n"); + } +#endif + + /* + * Out of IOMMU space handling. + * Reserve some invalid pages at the beginning of the GART. + */ + set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + + agp_memory_reserved = iommu_size; + printk(KERN_INFO + "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + iommu_size>>20); + + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_address = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* + * Unmap the IOMMU part of the GART. The alias of the page is + * always mapped with cache enabled and there is no full cache + * coherency across the GART remapping. The unmapping avoids + * automatic prefetches from the CPU allocating cache lines in + * there. All CPU accesses are done via the direct mapping to + * the backing memory. The GART address is only used by PCI + * devices. + */ +#ifndef CONFIG_XEN + clear_kernel_mapping((unsigned long)bus_to_virt(iommu_bus_base), iommu_size); +#else + /* + * In xen guests and hypervisor the IOMMU region is only mapped in + * virtual memory if it has been allocated from RAM. + */ + if (mfn_to_local_pfn(iommu_bus_base>>PAGE_SHIFT)>PAGE_SHIFT, + .arg2.nr_ents = iommu_size>>PAGE_SHIFT + }; + clear_kernel_mapping((unsigned long)bus_to_virt(iommu_bus_base), iommu_size); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)); + } +#endif + + /* + * Try to workaround a bug (thanks to BenH) + * Set unmapped entries to a scratch page instead of 0. + * Any prefetches that hit unmapped entries won't get an bus abort + * then. + */ + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) + panic("Cannot allocate iommu scratch page"); + gart_unmapped_entry = GPTE_ENCODE(virt_to_bus(scratch)); + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + iommu_gatt_base[i] = gart_unmapped_entry; + + flush_gart(); + iommu_detected = 1; + dma_ops = &gart_dma_ops; + printk(KERN_INFO "GART: ops set\n"); +} + +void gart_parse_options(char *p) +{ + int arg; + +#ifdef CONFIG_IOMMU_LEAK + if (!strncmp(p,"leak",4)) { + leak_trace = 1; + p += 4; + if (*p == '=') ++p; + if (isdigit(*p) && get_option(&p, &arg)) + iommu_leak_pages = arg; + } +#endif + if (isdigit(*p) && get_option(&p, &arg)) + iommu_size = arg; + if (!strncmp(p, "fullflush",8)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush",11)) + iommu_fullflush = 0; + if (!strncmp(p,"noagp",5)) + no_agp = 1; + if (!strncmp(p, "noaperture",10)) + fix_aperture = 0; + /* duplicated from pci-dma.c */ + if (!strncmp(p,"force",5)) + iommu_aperture_allowed = 1; + if (!strncmp(p,"allowed",7)) + iommu_aperture_allowed = 1; + if (!strncmp(p, "memaper", 7)) { + fallback_aper_force = 1; + p += 7; + if (*p == '=') { + ++p; + if (get_option(&p, &arg)) + fallback_aper_order = arg; + } + } +} diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/lib/swiotlb-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/lib/swiotlb-xen.c Fri Feb 09 16:32:04 2007 -0600 @@ -0,0 +1,831 @@ +/* + * Dynamic DMA mapping support. + * + * This implementation is for IA-64 and EM64T platforms that do not support + * I/O TLBs (aka DMA address translation hardware). + * Copyright (C) 2000 Asit Mallick + * Copyright (C) 2000 Goutham Rao + * Copyright (C) 2000, 2003 Hewlett-Packard Co + * David Mosberger-Tang + * + * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. + * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid + * unnecessary i-cache flushing. + * 04/07/.. ak Better overflow handling. Assorted fixes. + * 05/09/10 linville Add support for syncing ranges, support syncing for + * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#define OFFSET(val,align) ((unsigned long) \ + ( (val) & ( (align) - 1))) + +#define SG_ENT_VIRT_ADDRESS(sg) (page_address((sg)->page) + (sg)->offset) +#define SG_ENT_PHYS_ADDRESS(SG) virt_to_bus(SG_ENT_VIRT_ADDRESS(SG)) + +/* + * Maximum allowable number of contiguous slabs to map, + * must be a power of 2. What is the appropriate value ? + * The complexity of {map,unmap}_single is linearly dependent on this value. + */ +#define IO_TLB_SEGSIZE 128 + +/* + * log of the size of each IO TLB slab. The number of slabs is command line + * controllable. + */ +#define IO_TLB_SHIFT 11 + +#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) + +/* + * Minimum IO TLB size to bother booting with. Systems with mainly + * 64bit capable cards will only lightly use the swiotlb. If we can't + * allocate a contiguous 1MB, we're probably in trouble anyway. + */ +#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) + +/* + * Enumeration for sync targets + */ +enum dma_sync_target { + SYNC_FOR_CPU = 0, + SYNC_FOR_DEVICE = 1, +}; + +int swiotlb_force; + +/* + * Used to do a quick range check in swiotlb_unmap_single and + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this + * API. + */ +static char *io_tlb_start, *io_tlb_end; + +/* + * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and + * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. + */ +static unsigned long io_tlb_nslabs; + +/* + * When the IOMMU overflows we return a fallback buffer. This sets the size. + */ +static unsigned long io_tlb_overflow = 32*1024; + +void *io_tlb_overflow_buffer; + +/* + * This is a free list describing the number of free entries available from + * each index + */ +static unsigned int *io_tlb_list; +static unsigned int io_tlb_index; + +/* + * We need to save away the original address corresponding to a mapped entry + * for the sync operations. + */ +static unsigned char **io_tlb_orig_addr; + +/* + * Protect the above data structures in the map and unmap calls + */ +static DEFINE_SPINLOCK(io_tlb_lock); + +static int __init +setup_io_tlb_npages(char *str) +{ + if (isdigit(*str)) { + io_tlb_nslabs = simple_strtoul(str, &str, 0); + /* avoid tail segment of size < IO_TLB_SEGSIZE */ + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + if (*str == ',') + ++str; + if (!strcmp(str, "force")) + swiotlb_force = 1; + return 1; +} +__setup("swiotlb=", setup_io_tlb_npages); +/* make io_tlb_overflow tunable too? */ + +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the DMA API. + */ +void +swiotlb_init_with_default_size (size_t default_size) +{ + unsigned long i; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + /* + * Get IO TLB memory from the low pages + */ + io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT)); + if (!io_tlb_start) + panic("Cannot allocate SWIOTLB buffer"); +#ifdef CONFIG_XEN + if (xen_create_contiguous_region((unsigned long)io_tlb_start, get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT)), 31)) + panic("Cannot create a contiguous memory region below 4GB for the SWIOTLB buffer"); +#endif +//TODO ulim: make contiguous + io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT); + + /* + * Allocate and initialize the free list array. This array is used + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE + * between io_tlb_start and io_tlb_end. + */ + io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); + for (i = 0; i < io_tlb_nslabs; i++) + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_index = 0; + io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(char *)); + + /* + * Get the overflow emergency buffer + */ + io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); +#ifdef CONFIG_XEN + if (io_tlb_overflow_buffer) + xen_create_contiguous_region((unsigned long)io_tlb_overflow_buffer, get_order(io_tlb_overflow), 31); +#endif +//TODO ulim: make contiguous + printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n", + virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end)); +} + +void +swiotlb_init (void) +{ + swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */ +} + +/* + * Systems with larger DMA zones (those that don't support ISA) can + * initialize the swiotlb later using the slab allocator if needed. + * This should be just like above, but with some error catching. + */ +int +swiotlb_late_init_with_default_size (size_t default_size) +{ + unsigned long i, req_nslabs = io_tlb_nslabs; + unsigned int order; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + /* + * Get IO TLB memory from the low pages + */ + order = get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT)); + io_tlb_nslabs = SLABS_PER_PAGE << order; + + while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { +//TODO ulim: make contiguous + io_tlb_start = (char *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + order); +#ifdef CONFIG_XEN + if (io_tlb_start&&!xen_create_contiguous_region((unsigned long)io_tlb_start, order, 31)) { + free_pages((unsigned long)io_tlb_start, order); + io_tlb_start = NULL; + } +#endif + if (io_tlb_start) + break; + order--; + } + + if (!io_tlb_start) + goto cleanup1; + + if (order != get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT))) { + printk(KERN_WARNING "Warning: only able to allocate %ld MB " + "for software IO TLB\n", (PAGE_SIZE << order) >> 20); + io_tlb_nslabs = SLABS_PER_PAGE << order; + } + io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT); + memset(io_tlb_start, 0, io_tlb_nslabs * (1 << IO_TLB_SHIFT)); + + /* + * Allocate and initialize the free list array. This array is used + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE + * between io_tlb_start and io_tlb_end. + */ + io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * sizeof(int))); + if (!io_tlb_list) + goto cleanup2; + + for (i = 0; i < io_tlb_nslabs; i++) + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_index = 0; + + io_tlb_orig_addr = (unsigned char **)__get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * sizeof(char *))); + if (!io_tlb_orig_addr) + goto cleanup3; + + memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(char *)); + + /* + * Get the overflow emergency buffer + */ + io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA, + get_order(io_tlb_overflow)); +#ifdef CONFIG_XEN + if (io_tlb_overflow_buffer&&!xen_create_contiguous_region((unsigned long)io_tlb_overflow_buffer, get_order(io_tlb_overflow), 31)) { + free_pages((unsigned long)io_tlb_overflow_buffer, order); + io_tlb_overflow_buffer = NULL; + } +#endif + if (!io_tlb_overflow_buffer) + goto cleanup4; + +//TODO ulim: make contiguous + printk(KERN_INFO "Placing %ldMB software IO TLB between 0x%lx - " + "0x%lx\n", (io_tlb_nslabs * (1 << IO_TLB_SHIFT)) >> 20, + virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end)); + + return 0; + +cleanup4: + free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * + sizeof(char *))); + io_tlb_orig_addr = NULL; +cleanup3: + free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * + sizeof(int))); + io_tlb_list = NULL; + io_tlb_end = NULL; +cleanup2: +#ifdef CONFIG_XEN + xen_destroy_contiguous_region((unsigned long)io_tlb_start, order); +#endif + free_pages((unsigned long)io_tlb_start, order); +//TODO ulim: make contiguous + io_tlb_start = NULL; +cleanup1: + io_tlb_nslabs = req_nslabs; + return -ENOMEM; +} + +static inline int +address_needs_mapping(struct device *hwdev, dma_addr_t addr) +{ + dma_addr_t mask = 0xffffffff; + /* If the device has a mask, use it, otherwise default to 32 bits */ + if (hwdev && hwdev->dma_mask) + mask = *hwdev->dma_mask; + return (addr & ~mask) != 0; +} + +/* + * Allocates bounce buffer and returns its kernel virtual address. + */ +static void * +map_single(struct device *hwdev, char *buffer, size_t size, int dir) +{ + unsigned long flags; + char *dma_addr; + unsigned int nslots, stride, index, wrap; + int i; + + /* + * For mappings greater than a page, we limit the stride (and + * hence alignment) to a page size. + */ + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + if (size > PAGE_SIZE) + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); + else + stride = 1; + + BUG_ON(!nslots); + + /* + * Find suitable number of IO TLB entries size that will fit this + * request and allocate a buffer from that IO TLB pool. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + { + wrap = index = ALIGN(io_tlb_index, stride); + + if (index >= io_tlb_nslabs) + wrap = index = 0; + + do { + /* + * If we find a slot that indicates we have 'nslots' + * number of contiguous buffers, we allocate the + * buffers from that slot and mark the entries as '0' + * indicating unavailable. + */ + if (io_tlb_list[index] >= nslots) { + int count = 0; + + for (i = index; i < (int) (index + nslots); i++) + io_tlb_list[i] = 0; + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); + + /* + * Update the indices to avoid searching in + * the next round. + */ + io_tlb_index = ((index + nslots) < io_tlb_nslabs + ? (index + nslots) : 0); + + goto found; + } + index += stride; + if (index >= io_tlb_nslabs) + index = 0; + } while (index != wrap); + + spin_unlock_irqrestore(&io_tlb_lock, flags); + return NULL; + } + found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + + /* + * Save away the mapping from the original address to the DMA address. + * This is needed when we sync the memory. Then we sync the buffer if + * needed. + */ + io_tlb_orig_addr[index] = buffer; + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + memcpy(dma_addr, buffer, size); + + return dma_addr; +} + +/* + * dma_addr is the kernel virtual address of the bounce buffer to unmap. + */ +static void +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +{ + unsigned long flags; + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; + char *buffer = io_tlb_orig_addr[index]; + + /* + * First, sync the memory before unmapping the entry + */ + if (buffer && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) + /* + * bounce... copy the data back into the original buffer * and + * delete the bounce buffer. + */ + memcpy(buffer, dma_addr, size); + + /* + * Return the buffer to the free list by setting the corresponding + * entries to indicate the number of contigous entries available. + * While returning the entries to the free list, we merge the entries + * with slots below and above the pool being returned. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + { + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? + io_tlb_list[index + nslots] : 0); + /* + * Step 1: return the slots to the free list, merging the + * slots with superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) + io_tlb_list[i] = ++count; + /* + * Step 2: merge the returned slots with the preceding slots, + * if available (non zero) + */ + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + } + spin_unlock_irqrestore(&io_tlb_lock, flags); +} + +static void +sync_single(struct device *hwdev, char *dma_addr, size_t size, + int dir, int target) +{ + int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; + char *buffer = io_tlb_orig_addr[index]; + + switch (target) { + case SYNC_FOR_CPU: + if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + memcpy(buffer, dma_addr, size); + else + BUG_ON(dir != DMA_TO_DEVICE); + break; + case SYNC_FOR_DEVICE: + if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + memcpy(dma_addr, buffer, size); + else + BUG_ON(dir != DMA_FROM_DEVICE); + break; + default: + BUG(); + } +} + +void * +swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags) +{ + unsigned long dev_addr; + void *ret; + int order = get_order(size); + + /* + * XXX fix me: the DMA API should pass us an explicit DMA mask + * instead, or use ZONE_DMA32 (ia64 overloads ZONE_DMA to be a ~32 + * bit range instead of a 16MB one). + */ + flags |= GFP_DMA; + + ret = (void *)__get_free_pages(flags, order); + if (ret && address_needs_mapping(hwdev, virt_to_bus(ret))) { + /* + * The allocated memory isn't reachable by the device. + * Fall back on swiotlb_map_single(). + */ + free_pages((unsigned long) ret, order); + ret = NULL; + } + if (!ret) { + /* + * We are either out of memory or the device can't DMA + * to GFP_DMA memory; fall back on + * swiotlb_map_single(), which will grab memory from + * the lowest available address range. + */ + dma_addr_t handle; + handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE); + if (swiotlb_dma_mapping_error(handle)) + return NULL; + + ret = bus_to_virt(handle); + } + + memset(ret, 0, size); + dev_addr = virt_to_bus(ret); + + /* Confirm address can be DMA'd by device */ + if (address_needs_mapping(hwdev, dev_addr)) { + printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016lx\n", + (unsigned long long)*hwdev->dma_mask, dev_addr); + panic("swiotlb_alloc_coherent: allocated memory is out of " + "range for device"); + } + *dma_handle = dev_addr; + return ret; +} + +void +swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, + dma_addr_t dma_handle) +{ + if (!(vaddr >= (void *)io_tlb_start + && vaddr < (void *)io_tlb_end)) + free_pages((unsigned long) vaddr, get_order(size)); + else + /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ + swiotlb_unmap_single (hwdev, dma_handle, size, DMA_TO_DEVICE); +} + +static void +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * unless they check for dma_mapping_error (most don't) + * When the mapping is small enough return a static buffer to limit + * the damage, or panic when the transfer is too big. + */ + printk(KERN_ERR "DMA: Out of SW-IOMMU space for %lu bytes at " + "device %s\n", size, dev ? dev->bus_id : "?"); + + if (size > io_tlb_overflow && do_panic) { + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + panic("DMA: Memory would be corrupted\n"); + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + panic("DMA: Random memory would be DMAed\n"); + } +} + +/* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * physical address to use is returned. + * + * Once the device is given the dma address, the device owns this memory until + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. + */ +dma_addr_t +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) +{ + unsigned long dev_addr = virt_to_bus(ptr); + void *map; + + BUG_ON(dir == DMA_NONE); + /* + * If the pointer passed in happens to be in the device's DMA window, + * we can safely return the device addr and not worry about bounce + * buffering it. + */ + if (!address_needs_mapping(hwdev, dev_addr) && !swiotlb_force) + return dev_addr; + + /* + * Oh well, have to allocate and map a bounce buffer. + */ + map = map_single(hwdev, ptr, size, dir); + if (!map) { + swiotlb_full(hwdev, size, dir, 1); + map = io_tlb_overflow_buffer; + } + + dev_addr = virt_to_bus(map); + + /* + * Ensure that the address returned is DMA'ble + */ + if (address_needs_mapping(hwdev, dev_addr)) + panic("map_single: bounce buffer is not DMA'ble"); + + return dev_addr; +} + +/* + * Since DMA is i-cache coherent, any (complete) pages that were written via + * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to + * flush them when they get mapped into an executable vm-area. + */ +static void +mark_clean(void *addr, size_t size) +{ + unsigned long pg_addr, end; + + pg_addr = PAGE_ALIGN((unsigned long) addr); + end = (unsigned long) addr + size; + while (pg_addr + PAGE_SIZE <= end) { + struct page *page = virt_to_page(pg_addr); + set_bit(PG_arch_1, &page->flags); + pg_addr += PAGE_SIZE; + } +} + +/* + * Unmap a single streaming mode DMA translation. The dma_addr and size must + * match what was provided for in a previous swiotlb_map_single call. All + * other usages are undefined. + * + * After this call, reads by the cpu to the buffer are guaranteed to see + * whatever the device wrote there. + */ +void +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, + int dir) +{ + char *dma_addr = bus_to_virt(dev_addr); + + BUG_ON(dir == DMA_NONE); + if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) + unmap_single(hwdev, dma_addr, size, dir); + else if (dir == DMA_FROM_DEVICE) + mark_clean(dma_addr, size); +} + +/* + * Make physical memory consistent for a single streaming mode DMA translation + * after a transfer. + * + * If you perform a swiotlb_map_single() but wish to interrogate the buffer + * using the cpu, yet do not wish to teardown the dma mapping, you must + * call this function before doing so. At the next point you give the dma + * address back to the card, you must first perform a + * swiotlb_dma_sync_for_device, and then the device again owns the buffer + */ +static inline void +swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir, int target) +{ + char *dma_addr = bus_to_virt(dev_addr); + + BUG_ON(dir == DMA_NONE); + if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) + sync_single(hwdev, dma_addr, size, dir, target); + else if (dir == DMA_FROM_DEVICE) + mark_clean(dma_addr, size); +} + +void +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir) +{ + swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); +} + +void +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir) +{ + swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); +} + +/* + * Same as above, but for a sub-range of the mapping. + */ +static inline void +swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr, + unsigned long offset, size_t size, + int dir, int target) +{ + char *dma_addr = bus_to_virt(dev_addr) + offset; + + BUG_ON(dir == DMA_NONE); + if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) + sync_single(hwdev, dma_addr, size, dir, target); + else if (dir == DMA_FROM_DEVICE) + mark_clean(dma_addr, size); +} + +void +swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + unsigned long offset, size_t size, int dir) +{ + swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, + SYNC_FOR_CPU); +} + +void +swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr, + unsigned long offset, size_t size, int dir) +{ + swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, + SYNC_FOR_DEVICE); +} + +/* + * Map a set of buffers described by scatterlist in streaming mode for DMA. + * This is the scatter-gather version of the above swiotlb_map_single + * interface. Here the scatter gather list elements are each tagged with the + * appropriate dma address and length. They are obtained via + * sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for swiotlb_map_single are the + * same here. + */ +int +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems, + int dir) +{ + void *addr; + unsigned long dev_addr; + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) { + addr = SG_ENT_VIRT_ADDRESS(sg); + dev_addr = virt_to_bus(addr); + if (swiotlb_force || address_needs_mapping(hwdev, dev_addr)) { + void *map = map_single(hwdev, addr, sg->length, dir); + sg->dma_address = virt_to_bus(map); + if (!map) { + /* Don't panic here, we expect map_sg users + to do proper error handling. */ + swiotlb_full(hwdev, sg->length, dir, 0); + swiotlb_unmap_sg(hwdev, sg - i, i, dir); + sg[0].dma_length = 0; + return 0; + } + } else + sg->dma_address = dev_addr; + sg->dma_length = sg->length; + } + return nelems; +} + +/* + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_single() above. + */ +void +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems, + int dir) +{ + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) + if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) + unmap_single(hwdev, (void *) bus_to_virt(sg->dma_address), sg->dma_length, dir); + else if (dir == DMA_FROM_DEVICE) + mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length); +} + +/* + * Make physical memory consistent for a set of streaming mode DMA translations + * after a transfer. + * + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules + * and usage. + */ +static inline void +swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sg, + int nelems, int dir, int target) +{ + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) + if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) + sync_single(hwdev, (void *) sg->dma_address, + sg->dma_length, dir, target); +} + +void +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, int dir) +{ + swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); +} + +void +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, int dir) +{ + swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); +} + +int +swiotlb_dma_mapping_error(dma_addr_t dma_addr) +{ + return (dma_addr == virt_to_bus(io_tlb_overflow_buffer)); +} + +/* + * Return whether the given device DMA address mask can be supported + * properly. For example, if your device can only drive the low 24-bits + * during bus mastering, then you would pass 0x00ffffff as the mask to + * this function. + */ +int +swiotlb_dma_supported (struct device *hwdev, u64 mask) +{ + return (virt_to_bus (io_tlb_end) - 1) <= mask; +} + +EXPORT_SYMBOL(swiotlb_init); +EXPORT_SYMBOL(swiotlb_map_single); +EXPORT_SYMBOL(swiotlb_unmap_single); +EXPORT_SYMBOL(swiotlb_map_sg); +EXPORT_SYMBOL(swiotlb_unmap_sg); +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); +EXPORT_SYMBOL(swiotlb_sync_single_for_device); +EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu); +EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); +EXPORT_SYMBOL(swiotlb_sync_sg_for_device); +EXPORT_SYMBOL(swiotlb_dma_mapping_error); +EXPORT_SYMBOL(swiotlb_alloc_coherent); +EXPORT_SYMBOL(swiotlb_free_coherent); +EXPORT_SYMBOL(swiotlb_dma_supported);