Subject: libxc: use new (replacement) mmap-batch ioctl Replace all calls to xc_map_foreign_batch() where the caller doesn't look at the passed in array to check for errors by calls to xc_map_foreign_pages(). Replace all remaining calls by such to the newly introduced xc_map_foreign_bulk(). As a sideband modification (needed while writing the patch to ensure they're unused) eliminate unused parameters to uncanonicalize_pagetable() and xc_map_foreign_batch_single(). Also unmap live_p2m_frame_list earlier in map_and_save_p2m_table(), reducing the peak amount of virtual address space required. All supported OSes other than Linux continue to use the old ioctl for the time being. Also change libxc's MAJOR to 4.0 to reflect the API change? Signed-off-by: Jan Beulich --- 2010-01-06.orig/tools/include/xen-sys/Linux/privcmd.h 2010-01-12 10:13:31.000000000 +0100 +++ 2010-01-06/tools/include/xen-sys/Linux/privcmd.h 2010-01-12 10:20:06.000000000 +0100 @@ -64,6 +64,14 @@ typedef struct privcmd_mmapbatch { xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ } privcmd_mmapbatch_t; +typedef struct privcmd_mmapbatch_v2 { + unsigned int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + const xen_pfn_t __user *arr; /* array of mfns */ + int __user *err; /* array of error codes */ +} privcmd_mmapbatch_v2_t; + /* * @cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t @@ -75,5 +83,7 @@ typedef struct privcmd_mmapbatch { _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t)) #define IOCTL_PRIVCMD_MMAPBATCH \ _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t)) +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ + _IOC(_IOC_NONE, 'P', 4, sizeof(privcmd_mmapbatch_v2_t)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ --- 2010-01-06.orig/tools/libxc/Makefile 2009-12-17 12:20:22.000000000 +0100 +++ 2010-01-06/tools/libxc/Makefile 2010-01-12 10:18:08.000000000 +0100 @@ -1,7 +1,7 @@ XEN_ROOT = ../.. include $(XEN_ROOT)/tools/Rules.mk -MAJOR = 3.4 +MAJOR = 4.0 MINOR = 0 CTRL_SRCS-y := --- 2010-01-06.orig/tools/libxc/xc_domain_restore.c 2010-01-12 10:13:31.000000000 +0100 +++ 2010-01-06/tools/libxc/xc_domain_restore.c 2010-01-08 16:36:32.000000000 +0100 @@ -118,7 +118,7 @@ static int break_super_page(int xc_handl page_array[i] = start_pfn + i; } - ram_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ, + ram_base = xc_map_foreign_pages(xc_handle, dom, PROT_READ, page_array, tot_pfns); if ( ram_base == NULL ) @@ -166,7 +166,7 @@ static int break_super_page(int xc_handl page_array[i] = start_pfn + i; } - ram_base = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE, + ram_base = xc_map_foreign_pages(xc_handle, dom, PROT_WRITE, page_array, tot_pfns); if ( ram_base == NULL ) { @@ -494,7 +494,7 @@ static ssize_t read_exact_timed(int fd, ** the (now known) appropriate mfn values. */ static int uncanonicalize_pagetable(int xc_handle, uint32_t dom, struct restore_ctx *ctx, - unsigned long type, void *page, int superpages) + void *page, int superpages) { int i, pte_last; unsigned long pfn; @@ -1186,7 +1186,7 @@ static int apply_batch(int xc_handle, ui } /* Map relevant mfns */ - region_base = xc_map_foreign_batch( + region_base = xc_map_foreign_pages( xc_handle, dom, PROT_WRITE, region_mfn, j); if ( region_base == NULL ) @@ -1240,7 +1240,7 @@ static int apply_batch(int xc_handle, ui (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) { if (!uncanonicalize_pagetable(xc_handle, dom, ctx, - pagetype, page, superpages)) { + page, superpages)) { /* ** Failing to uncanonicalize a page table can be ok ** under live migration since the pages type may have @@ -1655,7 +1655,7 @@ int xc_domain_restore(int xc_handle, int if ( (i == (dinfo->p2m_size-1)) || (j == MAX_BATCH_SIZE) ) { - region_base = xc_map_foreign_batch( + region_base = xc_map_foreign_pages( xc_handle, dom, PROT_READ | PROT_WRITE, region_mfn, j); if ( region_base == NULL ) { @@ -1666,7 +1666,7 @@ int xc_domain_restore(int xc_handle, int for ( k = 0; k < j; k++ ) { if ( !uncanonicalize_pagetable( - xc_handle, dom, ctx, XEN_DOMCTL_PFINFO_L1TAB, + xc_handle, dom, ctx, region_base + k*PAGE_SIZE, superpages) ) { ERROR("failed uncanonicalize pt!"); @@ -1949,7 +1949,7 @@ int xc_domain_restore(int xc_handle, int } /* Copy the P2M we've constructed to the 'live' P2M */ - if ( !(ctx->live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE, + if ( !(ctx->live_p2m = xc_map_foreign_pages(xc_handle, dom, PROT_WRITE, p2m_frame_list, P2M_FL_ENTRIES)) ) { ERROR("Couldn't map p2m table"); --- 2010-01-06.orig/tools/libxc/xc_domain_save.c 2010-01-12 10:13:31.000000000 +0100 +++ 2010-01-06/tools/libxc/xc_domain_save.c 2010-01-11 14:39:23.000000000 +0100 @@ -709,7 +709,7 @@ static xen_pfn_t *map_and_save_p2m_table p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i]; live_p2m_frame_list = - xc_map_foreign_batch(xc_handle, dom, PROT_READ, + xc_map_foreign_pages(xc_handle, dom, PROT_READ, p2m_frame_list_list, P2M_FLL_ENTRIES); if ( !live_p2m_frame_list ) @@ -727,6 +727,9 @@ static xen_pfn_t *map_and_save_p2m_table memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE); memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE); + munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE); + live_p2m_frame_list = NULL; + /* Canonicalize guest's unsigned long vs ours */ if ( dinfo->guest_width > sizeof(unsigned long) ) for ( i = 0; i < P2M_FL_ENTRIES; i++ ) @@ -741,7 +744,7 @@ static xen_pfn_t *map_and_save_p2m_table (its not clear why it would want to change them, and we'll be OK from a safety POV anyhow. */ - p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ, + p2m = xc_map_foreign_pages(xc_handle, dom, PROT_READ, p2m_frame_list, P2M_FL_ENTRIES); if ( !p2m ) @@ -873,8 +876,9 @@ int xc_domain_save(int xc_handle, int io vcpu_guest_context_any_t ctxt; /* A table containing the type of each PFN (/not/ MFN!). */ - unsigned long *pfn_type = NULL; + xen_pfn_t *pfn_type = NULL; unsigned long *pfn_batch = NULL; + int *pfn_err = NULL; /* A copy of one frame of guest memory. */ char page[PAGE_SIZE]; @@ -1263,8 +1267,8 @@ int xc_domain_save(int xc_handle, int io if ( batch == 0 ) goto skip; /* vanishingly unlikely... */ - region_base = xc_map_foreign_batch( - xc_handle, dom, PROT_READ, pfn_type, batch); + region_base = xc_map_foreign_bulk( + xc_handle, dom, PROT_READ, pfn_type, pfn_err, batch); if ( region_base == NULL ) { ERROR("map batch failed"); @@ -1275,14 +1279,19 @@ int xc_domain_save(int xc_handle, int io { /* Look for and skip completely empty batches. */ for ( j = 0; j < batch; j++ ) - if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) != - XEN_DOMCTL_PFINFO_XTAB ) + { + if ( !pfn_err[j] ) break; + pfn_type[j] |= XEN_DOMCTL_PFINFO_XTAB; + } if ( j == batch ) { munmap(region_base, batch*PAGE_SIZE); continue; /* bail on this batch: no valid pages */ } + for ( ; j < batch; j++ ) + if ( pfn_err[j] ) + pfn_type[j] |= XEN_DOMCTL_PFINFO_XTAB; } else { @@ -1296,16 +1305,17 @@ int xc_domain_save(int xc_handle, int io goto out; } for ( j = batch-1; j >= 0; j-- ) - pfn_type[j] = ((uint32_t *)pfn_type)[j]; + pfn_type[j] = ((uint32_t *)pfn_type)[j] & + XEN_DOMCTL_PFINFO_LTAB_MASK; for ( j = 0; j < batch; j++ ) { + unsigned long mfn = pfn_to_mfn(pfn_batch[j]); - if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) == - XEN_DOMCTL_PFINFO_XTAB ) + if ( pfn_type[j] == XEN_DOMCTL_PFINFO_XTAB ) { DPRINTF("type fail: page %i mfn %08lx\n", - j, pfn_type[j]); + j, mfn); continue; } @@ -1313,16 +1323,13 @@ int xc_domain_save(int xc_handle, int io DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx" " sum= %08lx\n", iter, - (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) | - pfn_batch[j], - pfn_type[j], - mfn_to_pfn(pfn_type[j] & - ~XEN_DOMCTL_PFINFO_LTAB_MASK), + pfn_type[j] | pfn_batch[j], + mfn, + mfn_to_pfn(mfn), csum_page(region_base + (PAGE_SIZE*j))); /* canonicalise mfn->pfn */ - pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) | - pfn_batch[j]; + pfn_type[j] |= pfn_batch[j]; } } @@ -1332,11 +1339,17 @@ int xc_domain_save(int xc_handle, int io goto out; } + if ( sizeof(unsigned long) < sizeof(*pfn_type) ) + for ( j = 0; j < batch; j++ ) + ((unsigned long *)pfn_type)[j] = pfn_type[j]; if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) ) { PERROR("Error when writing to state file (3)"); goto out; } + if ( sizeof(unsigned long) < sizeof(*pfn_type) ) + while ( --j >= 0 ) + pfn_type[j] = ((unsigned long *)pfn_type)[j]; /* entering this loop, pfn_type is now in pfns (Not mfns) */ run = 0; --- 2010-01-06.orig/tools/libxc/xc_linux.c 2010-01-12 10:13:31.000000000 +0100 +++ 2010-01-06/tools/libxc/xc_linux.c 2010-01-12 10:19:59.000000000 +0100 @@ -64,7 +64,7 @@ int xc_interface_close(int xc_handle) return close(xc_handle); } -static int xc_map_foreign_batch_single(int xc_handle, uint32_t dom, int prot, +static int xc_map_foreign_batch_single(int xc_handle, uint32_t dom, xen_pfn_t *mfn, unsigned long addr) { privcmd_mmapbatch_t ioctlx; @@ -116,7 +116,7 @@ void *xc_map_foreign_batch(int xc_handle XEN_DOMCTL_PFINFO_PAGEDTAB ) { unsigned long paged_addr = (unsigned long)addr + (i << PAGE_SHIFT); - rc = xc_map_foreign_batch_single(xc_handle, dom, prot, &arr[i], + rc = xc_map_foreign_batch_single(xc_handle, dom, &arr[i], paged_addr); if ( rc < 0 ) goto out; @@ -137,6 +137,131 @@ void *xc_map_foreign_batch(int xc_handle return addr; } +void *xc_map_foreign_bulk(int xc_handle, uint32_t dom, int prot, + const xen_pfn_t *arr, int *err, unsigned int num) +{ + privcmd_mmapbatch_v2_t ioctlx; + void *addr; + unsigned int i; + int rc; + + addr = mmap(NULL, (unsigned long)num << PAGE_SHIFT, prot, MAP_SHARED, + xc_handle, 0); + if ( addr == MAP_FAILED ) + { + perror("xc_map_foreign_batch: mmap failed"); + return NULL; + } + + ioctlx.num = num; + ioctlx.dom = dom; + ioctlx.addr = (unsigned long)addr; + ioctlx.arr = arr; + ioctlx.err = err; + + rc = ioctl(xc_handle, IOCTL_PRIVCMD_MMAPBATCH_V2, &ioctlx); + + if ( rc < 0 && errno == ENOENT ) + { + for ( i = rc = 0; rc == 0 && i < num; i++ ) + { + if ( err[i] != -ENOENT ) + continue; + + ioctlx.num = 1; + ioctlx.dom = dom; + ioctlx.addr = (unsigned long)addr + ((unsigned long)i< 0 ) + { + /* + * IOCTL_PRIVCMD_MMAPBATCH_V2 is not supported - fall back to + * IOCTL_PRIVCMD_MMAPBATCH. + */ + xen_pfn_t *pfn = calloc(num, sizeof(*pfn)); + + if ( pfn ) + { + privcmd_mmapbatch_t ioctlx; + + memcpy(pfn, arr, num * sizeof(*arr)); + + ioctlx.num = num; + ioctlx.dom = dom; + ioctlx.addr = (unsigned long)addr; + ioctlx.arr = pfn; + + rc = ioctl(xc_handle, IOCTL_PRIVCMD_MMAPBATCH, &ioctlx); + + rc = rc < 0 ? -errno : 0; + + for ( i = 0; i < num; ++i ) + { + switch ( pfn[i] ^ arr[i] ) + { + case 0: + err[i] = rc != -ENOENT ? rc : 0; + continue; + default: + err[i] = -EINVAL; + continue; + case XEN_DOMCTL_PFINFO_PAGEDTAB: + if ( rc != -ENOENT ) + { + err[i] = rc ?: -EINVAL; + continue; + } + rc = xc_map_foreign_batch_single(xc_handle, dom, pfn + i, + (unsigned long)addr + ((unsigned long)i<gfn; ret = -EFAULT; - page = xc_map_foreign_batch(paging->xc_handle, victim->domain_id, + page = xc_map_foreign_pages(paging->xc_handle, victim->domain_id, PROT_READ | PROT_WRITE, &gfn, 1); - if ( (gfn & XEN_DOMCTL_PFINFO_LTAB_MASK) == XEN_DOMCTL_PFINFO_XTAB ) + if ( page == NULL ) { ERROR("Error mapping page"); goto out; @@ -386,7 +386,7 @@ int xenpaging_populate_page(xenpaging_t /* Map page */ ret = -EFAULT; - page = xc_map_foreign_batch(paging->xc_handle, paging->mem_event.domain_id, + page = xc_map_foreign_pages(paging->xc_handle, paging->mem_event.domain_id, PROT_READ | PROT_WRITE, gfn, 1); if ( page == NULL ) { @@ -394,13 +394,6 @@ int xenpaging_populate_page(xenpaging_t goto out_map; } - /* Check that the page exists */ - if ( (*gfn & XEN_DOMCTL_PFINFO_LTAB_MASK) == XEN_DOMCTL_PFINFO_XTAB ) - { - ERROR("Error mapping page: gfn is invalid"); - goto out; - } - /* Read page */ ret = read_page(fd, page, i); if ( ret != 0 )