# HG changeset patch
# User yamahata@xxxxxxxxxxxxx
# Node ID d3c3aeb5f3c1d652ee68efd51621ef4e020f7c76
# Parent  37e2596eb7786735e3cff9f33adf5510b48f7ef8
make xenLinux/ia64 privcmd mmap not use dom0 memory.

xenLinux/ia64 privcmd mmap uses pseudo-physical address space. It used
alloc_pages() to allocate that space, which wastes dom0 memory: depending
on the domU memory size, several hundred megabytes could be allocated.
With this patch xenLinux/ia64 tries to find a region that can be used
safely and uses that region instead.

PATCHNAME: privcmd_mmap_pseud_physical_addr

Signed-off-by: Isaku Yamahata

diff -r 37e2596eb778 -r d3c3aeb5f3c1 linux-2.6-xen-sparse/arch/ia64/xen/hypervisor.c
--- a/linux-2.6-xen-sparse/arch/ia64/xen/hypervisor.c	Fri May 19 16:56:59 2006 +0900
+++ b/linux-2.6-xen-sparse/arch/ia64/xen/hypervisor.c	Fri May 19 16:57:01 2006 +0900
@@ -360,49 +360,173 @@ struct address_space xen_ia64_foreign_du
 ///////////////////////////////////////////////////////////////////////////
 // foreign mapping
+#include <linux/efi.h>
+#include <asm/meminit.h>	// for IA64_GRANULE_SIZE, GRANULEROUND{UP,DOWN}()
+
+static unsigned long privcmd_resource_min = 0;
+// Xen/ia64 currently can handle pseudo physical address bits up to
+// (PAGE_SHIFT * 3)
+static unsigned long privcmd_resource_max = GRANULEROUNDDOWN((1UL << (PAGE_SHIFT * 3)) - 1);
+static unsigned long privcmd_resource_align = IA64_GRANULE_SIZE;
+
+static unsigned long
+md_end_addr(const efi_memory_desc_t *md)
+{
+	return md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
+}
+
+#define XEN_IA64_PRIVCMD_LEAST_GAP_SIZE	(1024 * 1024 * 1024UL)
+static int
+xen_ia64_privcmd_check_size(unsigned long start, unsigned long end)
+{
+	return (start < end &&
+		(end - start) > XEN_IA64_PRIVCMD_LEAST_GAP_SIZE);
+}
+
+static int __init
+xen_ia64_privcmd_init(void)
+{
+	void *efi_map_start, *efi_map_end, *p;
+	u64 efi_desc_size;
+	efi_memory_desc_t *md;
+	unsigned long tmp_min;
+	unsigned long tmp_max;
+	unsigned long gap_size;
+	unsigned long prev_end;
+
+	if (xen_init() < 0) {
+		return -1;
+	}
+
+	efi_map_start = __va(ia64_boot_param->efi_memmap);
+	efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
+	efi_desc_size = ia64_boot_param->efi_memdesc_size;
+
+	// at first check the used highest address
+	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
+		// nothing
+	}
+	md = p - efi_desc_size;
+	privcmd_resource_min = GRANULEROUNDUP(md_end_addr(md));
+	if (xen_ia64_privcmd_check_size(privcmd_resource_min,
+					privcmd_resource_max)) {
+		goto out;
+	}
+
+	// the used highest address is too large. try to find the largest gap.
+	tmp_min = privcmd_resource_max;
+	tmp_max = 0;
+	gap_size = 0;
+	prev_end = 0;
+	for (p = efi_map_start;
+	     p < efi_map_end - efi_desc_size;
+	     p += efi_desc_size) {
+		unsigned long end;
+		efi_memory_desc_t* next;
+		unsigned long next_start;
+
+		md = p;
+		end = md_end_addr(md);
+		if (end > privcmd_resource_max) {
+			break;
+		}
+		if (end < prev_end) {
+			// work around.
+			// Xen may pass incompletely sorted memory
+			// descriptors like
+			// [x, x + length]
+			// [x, x]
+			// this order should be reversed.
+			continue;
+		}
+		next = p + efi_desc_size;
+		next_start = next->phys_addr;
+		if (next_start > privcmd_resource_max) {
+			next_start = privcmd_resource_max;
+		}
+		if (end < next_start && gap_size < (next_start - end)) {
+			tmp_min = end;
+			tmp_max = next_start;
+			gap_size = tmp_max - tmp_min;
+		}
+		prev_end = end;
+	}
+
+	privcmd_resource_min = GRANULEROUNDUP(tmp_min);
+	if (xen_ia64_privcmd_check_size(privcmd_resource_min, tmp_max)) {
+		privcmd_resource_max = tmp_max;
+		goto out;
+	}
+
+	privcmd_resource_min = tmp_min;
+	privcmd_resource_max = tmp_max;
+	if (!xen_ia64_privcmd_check_size(privcmd_resource_min,
+					 privcmd_resource_max)) {
+		// Any large enough gap isn't found.
+		// go ahead anyway with the warning hoping that large region
+		// won't be requested.
+		printk(KERN_WARNING "xen privcmd: large enough region for privcmd mmap is not found.\n");
+	}
+
+out:
+	printk(KERN_INFO "xen privcmd uses pseudo physical addr range [0x%lx, 0x%lx] (%ldMB)\n",
+	       privcmd_resource_min, privcmd_resource_max,
+	       (privcmd_resource_max - privcmd_resource_min) >> 20);
+	BUG_ON(privcmd_resource_min >= privcmd_resource_max);
+	return 0;
+}
+late_initcall(xen_ia64_privcmd_init);
 
 struct xen_ia64_privcmd_entry {
 	atomic_t			map_count;
-	struct page*			page;
+#define INVALID_GPFN	(~0UL)
+	unsigned long			gpfn;
+};
+
+struct xen_ia64_privcmd_range {
+	atomic_t			ref_count;
+	unsigned long			pgoff; // in PAGE_SIZE
+	struct resource*		res;
+
+	unsigned long			num_entries;
+	struct xen_ia64_privcmd_entry	entries[0];
+};
+
+struct xen_ia64_privcmd_vma {
+	struct xen_ia64_privcmd_range*	range;
+
+	unsigned long			num_entries;
+	struct xen_ia64_privcmd_entry*	entries;
 };
 
 static void
 xen_ia64_privcmd_init_entry(struct xen_ia64_privcmd_entry* entry)
 {
 	atomic_set(&entry->map_count, 0);
-	entry->page = NULL;
-}
-
-//TODO alloc_page() to allocate pseudo physical address space is
-// waste of memory.
-// When vti domain is created, qemu maps all of vti domain pages which
-// reaches to several hundred megabytes at least.
-// remove alloc_page().
+	entry->gpfn = INVALID_GPFN;
+}
+
 static int
 xen_ia64_privcmd_entry_mmap(struct vm_area_struct* vma,
 			    unsigned long addr,
-			    struct xen_ia64_privcmd_entry* entry,
+			    struct xen_ia64_privcmd_range* privcmd_range,
+			    int i,
 			    unsigned long mfn,
 			    pgprot_t prot,
 			    domid_t domid)
 {
 	int error = 0;
-	struct page* page;
+	struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
 	unsigned long gpfn;
 
 	BUG_ON((addr & ~PAGE_MASK) != 0);
 	BUG_ON(mfn == INVALID_MFN);
 
-	if (entry->page != NULL) {
+	if (entry->gpfn != INVALID_GPFN) {
 		error = -EBUSY;
 		goto out;
 	}
 
-	page = alloc_page(GFP_KERNEL);
-	if (page == NULL) {
-		error = -ENOMEM;
-		goto out;
-	}
-	gpfn = page_to_pfn(page);
+	gpfn = (privcmd_range->res->start >> PAGE_SHIFT) + i;
 
 	error = HYPERVISOR_add_physmap(gpfn, mfn, 0/* prot:XXX */,
 				       domid);
@@ -413,15 +537,13 @@ xen_ia64_privcmd_entry_mmap(struct vm_ar
 	prot = vma->vm_page_prot;
 	error = remap_pfn_range(vma, addr, gpfn, 1 << PAGE_SHIFT, prot);
 	if (error != 0) {
-		(void)HYPERVISOR_zap_physmap(gpfn, 0);
-		error = HYPERVISOR_populate_physmap(gpfn, 0, 0);
+		error = HYPERVISOR_zap_physmap(gpfn, 0);
 		if (error) {
 			BUG();//XXX
 		}
-		__free_page(page);
 	} else {
 		atomic_inc(&entry->map_count);
-		entry->page = page;
+		entry->gpfn = gpfn;
 	}
 
 out:
@@ -429,30 +551,28 @@ out:
 }
 
 static void
-xen_ia64_privcmd_entry_munmap(struct xen_ia64_privcmd_entry* entry)
-{
-	struct page* page = entry->page;
-	unsigned long gpfn = page_to_pfn(page);
+xen_ia64_privcmd_entry_munmap(struct xen_ia64_privcmd_range* privcmd_range,
+			      int i)
+{
+	struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
+	unsigned long gpfn = entry->gpfn;
+	//gpfn = (privcmd_range->res->start >> PAGE_SHIFT) +
+	//	(vma->vm_pgoff - privcmd_range->pgoff);
 	int error;
 
 	error = HYPERVISOR_zap_physmap(gpfn, 0);
 	if (error) {
 		BUG();//XXX
 	}
-
-	error = HYPERVISOR_populate_physmap(gpfn, 0, 0);
-	if (error) {
-		BUG();//XXX
-	}
-
-	entry->page = NULL;
-	__free_page(page);
+	entry->gpfn = INVALID_GPFN;
 }
 
 static int
-xen_ia64_privcmd_entry_open(struct xen_ia64_privcmd_entry* entry)
-{
-	if (entry->page != NULL) {
+xen_ia64_privcmd_entry_open(struct xen_ia64_privcmd_range* privcmd_range,
+			    int i)
+{
+	struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
+	if (entry->gpfn != INVALID_GPFN) {
 		atomic_inc(&entry->map_count);
 	} else {
 		BUG_ON(atomic_read(&entry->map_count) != 0);
@@ -460,27 +580,15 @@ xen_ia64_privcmd_entry_open(struct xen_i
 }
 
 static int
-xen_ia64_privcmd_entry_close(struct xen_ia64_privcmd_entry* entry)
-{
-	if (entry->page != NULL && atomic_dec_and_test(&entry->map_count)) {
-		xen_ia64_privcmd_entry_munmap(entry);
-	}
-}
-
-struct xen_ia64_privcmd_range {
-	atomic_t	ref_count;
-	unsigned long	pgoff; // in PAGE_SIZE
-
-	unsigned long	num_entries;
-	struct xen_ia64_privcmd_entry	entries[0];
-};
-
-struct xen_ia64_privcmd_vma {
-	struct xen_ia64_privcmd_range*	range;
-
-	unsigned long	num_entries;
-	struct xen_ia64_privcmd_entry*	entries;
-};
+xen_ia64_privcmd_entry_close(struct xen_ia64_privcmd_range* privcmd_range,
+			     int i)
+{
+	struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
+	if (entry->gpfn != INVALID_GPFN &&
+	    atomic_dec_and_test(&entry->map_count)) {
+		xen_ia64_privcmd_entry_munmap(privcmd_range, i);
+	}
+}
 
 static void xen_ia64_privcmd_vma_open(struct vm_area_struct* vma);
 static void xen_ia64_privcmd_vma_close(struct vm_area_struct* vma);
@@ -507,7 +615,7 @@ __xen_ia64_privcmd_vma_open(struct vm_ar
 	privcmd_vma->entries = &privcmd_range->entries[entry_offset];
 	vma->vm_private_data = privcmd_vma;
 	for (i = 0; i < privcmd_vma->num_entries; i++) {
-		xen_ia64_privcmd_entry_open(&privcmd_vma->entries[i]);
+		xen_ia64_privcmd_entry_open(privcmd_range, entry_offset + i);
 	}
 
 	vma->vm_private_data = privcmd_vma;
@@ -533,10 +641,11 @@ xen_ia64_privcmd_vma_close(struct vm_are
 	struct xen_ia64_privcmd_vma* privcmd_vma =
 		(struct xen_ia64_privcmd_vma*)vma->vm_private_data;
 	struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
+	unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
 	unsigned long i;
 
 	for (i = 0; i < privcmd_vma->num_entries; i++) {
-		xen_ia64_privcmd_entry_close(&privcmd_vma->entries[i]);
+		xen_ia64_privcmd_entry_close(privcmd_range, entry_offset + i);
 	}
 	vma->vm_private_data = NULL;
 	kfree(privcmd_vma);
@@ -547,9 +656,11 @@ xen_ia64_privcmd_vma_close(struct vm_are
 			struct xen_ia64_privcmd_entry* entry =
 				&privcmd_range->entries[i];
 			BUG_ON(atomic_read(&entry->map_count) != 0);
-			BUG_ON(entry->page != NULL);
+			BUG_ON(entry->gpfn != INVALID_GPFN);
 		}
 #endif
+		release_resource(privcmd_range->res);
+		kfree(privcmd_range->res);
 		vfree(privcmd_range);
 	}
 }
@@ -557,13 +668,18 @@ int
 int
 privcmd_mmap(struct file * file, struct vm_area_struct * vma)
 {
-	unsigned long num_entries = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-	struct xen_ia64_privcmd_range* privcmd_range;
-	struct xen_ia64_privcmd_vma* privcmd_vma;
+	int error;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	unsigned long num_entries = size >> PAGE_SHIFT;
+	struct xen_ia64_privcmd_range* privcmd_range = NULL;
+	struct xen_ia64_privcmd_vma* privcmd_vma = NULL;
+	struct resource* res = NULL;
 	unsigned long i;
 
 	BUG_ON(!running_on_xen);
 	BUG_ON(file->private_data != NULL);
+
+	error = -ENOMEM;
 	privcmd_range =
 		vmalloc(sizeof(*privcmd_range) +
 			sizeof(privcmd_range->entries[0]) * num_entries);
@@ -574,6 +690,18 @@ privcmd_mmap(struct file * file, struct 
 	if (privcmd_vma == NULL) {
 		goto out_enomem1;
 	}
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (res == NULL) {
+		goto out_enomem1;
+	}
+	res->name = "Xen privcmd mmap";
+	error = allocate_resource(&iomem_resource, res, size,
+				  privcmd_resource_min, privcmd_resource_max,
+				  privcmd_resource_align, NULL, NULL);
+	if (error) {
+		goto out_enomem1;
+	}
+	privcmd_range->res = res;
 
 	/* DONTCOPY is essential for Xen as copy_page_range is broken. */
 	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
@@ -589,10 +717,11 @@ privcmd_mmap(struct file * file, struct 
 	return 0;
 
 out_enomem1:
+	kfree(res);
 	kfree(privcmd_vma);
out_enomem0:
 	vfree(privcmd_range);
-	return -ENOMEM;
+	return error;
 }
 
 int
@@ -605,6 +734,9 @@ direct_remap_pfn_range(struct vm_area_st
 {
 	struct xen_ia64_privcmd_vma* privcmd_vma =
 		(struct xen_ia64_privcmd_vma*)vma->vm_private_data;
+	struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
+	unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
+
 	unsigned long i;
 	unsigned long offset;
 	int error = 0;
@@ -618,9 +750,7 @@ direct_remap_pfn_range(struct vm_area_st
 	i = (address - vma->vm_start) >> PAGE_SHIFT;
 
 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
-		struct xen_ia64_privcmd_entry* entry =
-			&privcmd_vma->entries[i];
-		error = xen_ia64_privcmd_entry_mmap(vma, (address + offset) & PAGE_MASK, entry, mfn, prot, domid);
+		error = xen_ia64_privcmd_entry_mmap(vma, (address + offset) & PAGE_MASK, privcmd_range, entry_offset + i, mfn, prot, domid);
 		if (error != 0) {
 			break;
 		}
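
For readers tracing the patch: xen_ia64_privcmd_init() above picks the
pseudo-physical region in two steps. It first tries the area above the
highest address used in the EFI memory map; if that leaves less than
XEN_IA64_PRIVCMD_LEAST_GAP_SIZE (1GB), it falls back to scanning the map
for the largest hole between descriptors. The user-space sketch below
isolates that fallback scan. struct mem_desc, the constants, and the
sample map in main() are hypothetical stand-ins (not the kernel's types
or values); only find_largest_gap() mirrors the second loop of the
initializer, including its skip of out-of-order descriptors.

/*
 * Minimal user-space sketch (assumes a 64-bit host) of the fallback
 * largest-gap search in xen_ia64_privcmd_init().  struct mem_desc and
 * the constants are illustrative stand-ins for efi_memory_desc_t and
 * the ia64 parameters; they are not kernel definitions.
 */
#include <stdio.h>

#define EFI_PAGE_SHIFT	12			/* stand-in */
#define RESOURCE_MAX	(1UL << 44)		/* stand-in for privcmd_resource_max */

struct mem_desc {				/* stand-in for efi_memory_desc_t */
	unsigned long phys_addr;
	unsigned long num_pages;
};

static unsigned long
md_end_addr(const struct mem_desc *md)
{
	return md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
}

/*
 * Walk adjacent descriptor pairs and remember the largest hole between
 * the end of one descriptor and the start of the next, as the second
 * loop of xen_ia64_privcmd_init() does.  A descriptor whose end goes
 * backwards is skipped, mirroring the patch's workaround for
 * incompletely sorted memory maps.
 */
static void
find_largest_gap(const struct mem_desc *map, int n,
		 unsigned long *gap_start, unsigned long *gap_end)
{
	unsigned long gap_size = 0;
	unsigned long prev_end = 0;
	int i;

	*gap_start = RESOURCE_MAX;
	*gap_end = 0;
	for (i = 0; i < n - 1; i++) {
		unsigned long end = md_end_addr(&map[i]);
		unsigned long next_start = map[i + 1].phys_addr;

		if (end > RESOURCE_MAX)
			break;
		if (end < prev_end)
			continue;	/* out-of-order descriptor */
		if (next_start > RESOURCE_MAX)
			next_start = RESOURCE_MAX;
		if (end < next_start && gap_size < next_start - end) {
			*gap_start = end;
			*gap_end = next_start;
			gap_size = next_start - end;
		}
		prev_end = end;
	}
}

int
main(void)
{
	/* Hypothetical map: 1MB at 0, 1MB at 64MB, 1MB adjacent to it. */
	const struct mem_desc map[] = {
		{ 0x0000000UL, 0x100 },
		{ 0x4000000UL, 0x100 },
		{ 0x4100000UL, 0x100 },
	};
	unsigned long start, end;

	find_largest_gap(map, sizeof(map) / sizeof(map[0]), &start, &end);
	printf("largest gap: [0x%lx, 0x%lx)\n", start, end);
	return 0;
}

This prints "largest gap: [0x100000, 0x4000000)": the 63MB hole between
the first two regions wins, and the adjacent third region contributes
none. In the patch, the chosen bounds are then rounded to
IA64_GRANULE_SIZE, and privcmd_mmap() carves page ranges out of them
with allocate_resource(); each mapped page gets
gpfn = (res->start >> PAGE_SHIFT) + i, so the pseudo-physical backing no
longer consumes dom0 pages.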