# HG changeset patch
# User akw27@xxxxxxxxxxxxxxxxxxxxxx
# Node ID eaf498f1ffdef1d63ef9df03f1d8ea749227d183
# Parent 0237746ecf92423a1b948836902f857c4cc3ddd3
Add grant table support to block tap.
This patch adds grant table support to the block tap. The AIO support
introduced in patch 9f0eff879d8913a824280cf67658a530c80e8424 still
works -- The tap code maps a granted page twice, once in kernel and
once in user. The kernel page is patched into the p2m table and pages
added to the user vm_area are mapped to the appropriate underlying
struct pages using the VM_FOREIGN hooks in get_user_pages().
Comparing block IO from dom0 to the existing block backend, and to the
tap managing the same partition as the BE from user space with AIO, I
get the following performance:
Version 1.03 ------Sequential Output------ --Sequential Input- --Random-
-Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP /sec %CP
xen0 2G 31198 95 56818 8 20967 2 28415 77 59595 4 264.9 0
xenU-blkbe2cpuGT 2G 31157 96 54026 10 25585 4 30664 90 64919 7 292.7 0
xenU-blktp2cpuGT 2G 32313 97 54217 8 20950 3 28117 87 65924 4 191.8 0
Signed-off-by: andrew.warfield@xxxxxxxxxxxx
diff -r 0237746ecf92 -r eaf498f1ffde
linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Tue Aug 16 07:07:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Tue Aug 16 10:12:18 2005
@@ -23,6 +23,9 @@
blkif_be_driver_status_t be_st;
printk(KERN_INFO "Initialising Xen block tap device\n");
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+ printk(KERN_INFO "Block tap is using grant tables.\n");
+#endif
DPRINTK(" tap - Backend connection init:\n");
diff -r 0237746ecf92 -r eaf498f1ffde
linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h Tue Aug 16 07:07:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h Tue Aug 16 10:12:18 2005
@@ -85,6 +85,11 @@
spinlock_t blk_ring_lock;
atomic_t refcnt;
struct work_struct work;
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+ u16 shmem_handle;
+ memory_t shmem_vaddr;
+ grant_ref_t shmem_ref;
+#endif
} blkif_t;
blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
diff -r 0237746ecf92 -r eaf498f1ffde
linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c Tue Aug
16 07:07:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c Tue Aug
16 10:12:18 2005
@@ -9,6 +9,7 @@
*/
#include "blktap.h"
+#include <asm-xen/evtchn.h>
static char *blkif_state_name[] = {
[BLKIF_STATE_CLOSED] = "closed",
@@ -48,12 +49,21 @@
blkif_t *blkif = (blkif_t *)arg;
ctrl_msg_t cmsg;
blkif_be_disconnect_t disc;
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+ struct gnttab_unmap_grant_ref op;
+#endif
/*
* These can't be done in blkif_disconnect() because at that point there
* may be outstanding requests at the disc whose asynchronous responses
* must still be notified to the remote driver.
*/
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+ op.host_addr = blkif->shmem_vaddr;
+ op.handle = blkif->shmem_handle;
+ op.dev_bus_addr = 0;
+ BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
+#endif
vfree(blkif->blk_ring.sring);
/* Construct the deferred response message. */
@@ -177,8 +187,12 @@
unsigned int evtchn = connect->evtchn;
unsigned long shmem_frame = connect->shmem_frame;
struct vm_struct *vma;
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+ int ref = connect->shmem_ref;
+#else
pgprot_t prot;
int error;
+#endif
blkif_t *blkif;
blkif_sring_t *sring;
@@ -199,24 +213,46 @@
return;
}
- prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+#ifndef CONFIG_XEN_BLKDEV_GRANT
+ prot = __pgprot(_KERNPG_TABLE);
error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
prot, domid);
if ( error != 0 )
{
- WPRINTK("BE_CONNECT: error! (%d)\n", error);
if ( error == -ENOMEM )
connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
- else if ( error == -EFAULT ) {
+ else if ( error == -EFAULT )
connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
- WPRINTK("BE_CONNECT: MAPPING error!\n");
- }
else
connect->status = BLKIF_BE_STATUS_ERROR;
vfree(vma->addr);
return;
}
+#else
+ { /* Map: Use the Grant table reference */
+ struct gnttab_map_grant_ref op;
+ op.host_addr = VMALLOC_VMADDR(vma->addr);
+ op.flags = GNTMAP_host_map;
+ op.ref = ref;
+ op.dom = domid;
+
+ BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) );
+
+ handle = op.handle;
+
+ if (op.handle < 0) {
+ DPRINTK(" Grant table operation failure !\n");
+ connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
+ vfree(vma->addr);
+ return;
+ }
+
+ blkif->shmem_ref = ref;
+ blkif->shmem_handle = handle;
+ blkif->shmem_vaddr = VMALLOC_VMADDR(vma->addr);
+ }
+#endif
if ( blkif->status != DISCONNECTED )
{
diff -r 0237746ecf92 -r eaf498f1ffde
linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c Tue Aug 16
07:07:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c Tue Aug 16
10:12:18 2005
@@ -21,6 +21,9 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm-xen/xen-public/io/blkif.h> /* for control ring. */
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+#include <asm-xen/xen-public/grant_table.h>
+#endif
#include "blktap.h"
@@ -42,6 +45,7 @@
/* local prototypes */
static int blktap_read_fe_ring(void);
static int blktap_read_be_ring(void);
+
/* -------[ mmap region ]--------------------------------------------- */
/*
@@ -73,7 +77,28 @@
((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
((_seg) * PAGE_SIZE))
-
+/* -------[ grant handles ]------------------------------------------- */
+
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+/* When using grant tables to map a frame for device access then the
+ * handle returned must be used to unmap the frame. This is needed to
+ * drop the ref count on the frame.
+ */
+struct grant_handle_pair
+{
+ u16 kernel;
+ u16 user;
+};
+static struct grant_handle_pair pending_grant_handles[MMAP_PAGES];
+#define pending_handle(_idx, _i) \
+ (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
+#define BLKTAP_INVALID_HANDLE(_g) \
+ (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF))
+#define BLKTAP_INVALIDATE_HANDLE(_g) do { \
+ (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
+ } while(0)
+
+#endif
/* -------[ blktap vm ops ]------------------------------------------- */
@@ -348,9 +373,43 @@
/*-----[ Data to/from user space ]----------------------------------------*/
-
static void fast_flush_area(int idx, int nr_pages)
{
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+ unsigned int i, op = 0;
+ struct grant_handle_pair *handle;
+ unsigned long ptep;
+
+ for (i=0; i<nr_pages; i++)
+ {
+ handle = &pending_handle(idx, i);
+ if (!BLKTAP_INVALID_HANDLE(handle))
+ {
+
+ unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i);
+ unmap[op].dev_bus_addr = 0;
+ unmap[op].handle = handle->kernel;
+ op++;
+
+ if (create_lookup_pte_addr(blktap_vma->vm_mm,
+ MMAP_VADDR(user_vstart, idx, i),
+ &ptep) !=0) {
+ DPRINTK("Couldn't get a pte addr!\n");
+ return;
+ }
+ unmap[op].host_addr = ptep;
+ unmap[op].dev_bus_addr = 0;
+ unmap[op].handle = handle->user;
+ op++;
+
+ BLKTAP_INVALIDATE_HANDLE(handle);
+ }
+ }
+ if ( unlikely(HYPERVISOR_grant_table_op(
+ GNTTABOP_unmap_grant_ref, unmap, op)))
+ BUG();
+#else
multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
int i;
@@ -363,21 +422,22 @@
mcl[nr_pages-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
BUG();
-}
-
-
-extern int __direct_remap_area_pages(struct mm_struct *mm,
- unsigned long address,
- unsigned long size,
- mmu_update_t *v);
+#endif
+}
+
int blktap_write_fe_ring(blkif_request_t *req)
{
blkif_request_t *target;
- int i;
+ int i, ret = 0;
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+ int op;
+#else
unsigned long remap_prot;
multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST+1];
mmu_update_t mmu[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+#endif
/*
* This is called to pass a request from the real frontend domain's
@@ -394,18 +454,109 @@
return 0;
}
- remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
flush_cache_all(); /* a noop on intel... */
target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt);
memcpy(target, req, sizeof(*req));
/* Map the foreign pages directly in to the application */
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+ op = 0;
+ for (i=0; i<target->nr_segments; i++) {
+
+ unsigned long uvaddr;
+ unsigned long kvaddr;
+ unsigned long ptep;
+
+ uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i);
+ kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
+
+ /* Map the remote page to kernel. */
+ map[op].host_addr = kvaddr;
+ map[op].dom = ID_TO_DOM(req->id);
+ map[op].ref = blkif_gref_from_fas(target->frame_and_sects[i]);
+ map[op].flags = GNTMAP_host_map;
+ /* This needs a bit more thought in terms of interposition:
+ * If we want to be able to modify pages during write using
+ * grant table mappings, the guest will either need to allow
+ * it, or we'll need to incur a copy. */
+ if (req->operation == BLKIF_OP_WRITE)
+ map[op].flags |= GNTMAP_readonly;
+ op++;
+
+ /* Now map it to user. */
+ ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
+ if (ret)
+ {
+ DPRINTK("Couldn't get a pte addr!\n");
+ goto fail;
+ }
+
+ map[op].host_addr = ptep;
+ map[op].dom = ID_TO_DOM(req->id);
+ map[op].ref = blkif_gref_from_fas(target->frame_and_sects[i]);
+ map[op].flags = GNTMAP_host_map | GNTMAP_application_map
+ | GNTMAP_contains_pte;
+ /* Above interposition comment applies here as well. */
+ if (req->operation == BLKIF_OP_WRITE)
+ map[op].flags |= GNTMAP_readonly;
+ op++;
+ }
+
+ if ( unlikely(HYPERVISOR_grant_table_op(
+ GNTTABOP_map_grant_ref, map, op)))
+ BUG();
+
+ op = 0;
+ for (i=0; i<(target->nr_segments*2); i+=2) {
+ unsigned long uvaddr;
+ unsigned long kvaddr;
+ unsigned long offset;
+ int cancel = 0;
+
+ uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i/2);
+ kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i/2);
+
+ if ( unlikely(map[i].handle < 0) ) {
+ DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle);
+ ret = map[i].handle;
+ cancel = 1;
+ }
+
+ if ( unlikely(map[i+1].handle < 0) ) {
+ DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle);
+ ret = map[i+1].handle;
+ cancel = 1;
+ }
+
+ if (cancel)
+ goto fail;
+
+ /* Set the necessary mappings in p2m and in the VM_FOREIGN
+ * vm_area_struct to allow user vaddr -> struct page lookups
+ * to work. This is needed for direct IO to foreign pages. */
+ phys_to_machine_mapping[__pa(kvaddr)>>PAGE_SHIFT] =
+ FOREIGN_FRAME(map[i].dev_bus_addr);
+
+ offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
+ ((struct page **)blktap_vma->vm_private_data)[offset] =
+ pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+
+ /* Save handles for unmapping later. */
+ pending_handle(ID_TO_IDX(req->id), i/2).kernel = map[i].handle;
+ pending_handle(ID_TO_IDX(req->id), i/2).user = map[i+1].handle;
+ }
+
+#else
+
+ remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
+
for (i=0; i<target->nr_segments; i++) {
unsigned long buf;
unsigned long uvaddr;
unsigned long kvaddr;
unsigned long offset;
+ unsigned long ptep;
buf = target->frame_and_sects[i] & PAGE_MASK;
uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i);
@@ -421,10 +572,14 @@
phys_to_machine_mapping[__pa(kvaddr)>>PAGE_SHIFT] =
FOREIGN_FRAME(buf >> PAGE_SHIFT);
- __direct_remap_area_pages(blktap_vma->vm_mm,
- uvaddr,
- PAGE_SIZE,
- &mmu[i]);
+ ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
+ if (ret)
+ {
+ DPRINTK("error getting pte\n");
+ goto fail;
+ }
+
+ mmu[i].ptr = ptep;
mmu[i].val = (target->frame_and_sects[i] & PAGE_MASK)
| pgprot_val(blktap_vma->vm_page_prot);
@@ -448,16 +603,17 @@
if ( unlikely(mcl[i].result != 0) )
{
DPRINTK("invalid buffer -- could not remap it\n");
- fast_flush_area(ID_TO_IDX(req->id), target->nr_segments);
- return -1;
+ ret = mcl[i].result;
+ goto fail;
}
}
if ( unlikely(mcl[i].result != 0) )
{
DPRINTK("direct remapping of pages to /dev/blktap failed.\n");
- return -1;
- }
-
+ ret = mcl[i].result;
+ goto fail;
+ }
+#endif /* CONFIG_XEN_BLKDEV_GRANT */
/* Mark mapped pages as reserved: */
for ( i = 0; i < target->nr_segments; i++ )
@@ -472,6 +628,10 @@
blktap_ufe_ring.req_prod_pvt++;
return 0;
+
+ fail:
+ fast_flush_area(ID_TO_IDX(req->id), target->nr_segments);
+ return ret;
}
int blktap_write_be_ring(blkif_response_t *rsp)
@@ -538,11 +698,10 @@
map[offset] = NULL;
}
-
+ fast_flush_area(ID_TO_IDX(resp_s->id), ar->nr_pages);
zap_page_range(blktap_vma,
MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), 0),
ar->nr_pages << PAGE_SHIFT, NULL);
- fast_flush_area(ID_TO_IDX(resp_s->id), ar->nr_pages);
write_resp_to_fe_ring(blkif, resp_s);
blktap_ufe_ring.rsp_cons = i + 1;
kick_fe_domain(blkif);
@@ -616,10 +775,16 @@
int blktap_init(void)
{
- int err;
+ int err, i, j;
if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
BUG();
+
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+ for (i=0; i<MAX_PENDING_REQS ; i++)
+ for (j=0; j<BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
+ BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j));
+#endif
err = misc_register(&blktap_miscdev);
if ( err != 0 )
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|