# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 8004acaa668454c75c4d02d634b2af3a84f6f8c1
# Parent 43f424818d6ef3d3c877774b03e39fe47c8c094a
# Parent 9f0eff879d8913a824280cf67658a530c80e8424
Merge
diff -r 43f424818d6e -r 8004acaa6684 linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c Thu Aug 4 16:53:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c Thu Aug 4 16:53:30 2005
@@ -137,7 +137,7 @@
blkif_control_probe_send(&req, &rsp,
(unsigned long)(virt_to_machine(buf)));
#else
- req.frame_and_sects[0] = blkif_fas(virt_to_machine(buf), 0, ((PAGE_SIZE/512)-1);
+ req.frame_and_sects[0] = blkif_fas(virt_to_machine(buf), 0, (PAGE_SIZE/512)-1);
blkif_control_send(&req, &rsp);
#endif
diff -r 43f424818d6e -r 8004acaa6684 linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h Thu Aug 4 16:53:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h Thu Aug 4 16:53:30 2005
@@ -103,8 +103,6 @@
blkif_t *blkif;
unsigned long id;
int nr_pages;
- unsigned long mach_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- unsigned long virt_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
int next_free;
} active_req_t;
@@ -172,32 +170,7 @@
/* -------[ Mappings to User VMA ]------------------------------------ */
-#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16
-extern struct vm_area_struct *blktap_vma;
-
-/* The following are from blkback.c and should probably be put in a
- * header and included from there.
- * The mmap area described here is where attached data pages eill be mapped.
- */
-
-extern unsigned long mmap_vstart;
-#define MMAP_PAGES_PER_REQUEST \
- (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
-#define MMAP_PAGES \
- (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
-#define MMAP_VADDR(_req,_seg) \
- (mmap_vstart + \
- ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
- ((_seg) * PAGE_SIZE))
-
-/* immediately before the mmap area, we have a bunch of pages reserved
- * for shared memory rings.
- */
-
-#define RING_PAGES 3 /* Ctrl, Front, and Back */
-extern unsigned long rings_vstart;
-
/* -------[ Here be globals ]----------------------------------------- */
extern unsigned long blktap_mode;
diff -r 43f424818d6e -r 8004acaa6684 linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c Thu Aug 4 16:53:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c Thu Aug 4 16:53:30 2005
@@ -280,8 +280,6 @@
int more_to_do = 0;
int notify_be = 0, notify_user = 0;
- if (NR_ACTIVE_REQS == MAX_ACTIVE_REQS) return 1;
-
/* lock both rings */
spin_lock_irqsave(&blkif_io_lock, flags);
diff -r 43f424818d6e -r 8004acaa6684 linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c Thu Aug 4 16:53:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c Thu Aug 4 16:53:30 2005
@@ -19,6 +19,7 @@
#include <linux/gfp.h>
#include <linux/poll.h>
#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
#include <asm-xen/xen-public/io/blkif.h> /* for control ring. */
#include "blktap.h"
@@ -32,11 +33,6 @@
/* for poll: */
static wait_queue_head_t blktap_wait;
-
-/* Where things are inside the device mapping. */
-struct vm_area_struct *blktap_vma = NULL;
-unsigned long mmap_vstart;
-unsigned long rings_vstart;
/* Rings up to user space. */
static blkif_front_ring_t blktap_ufe_ring;
@@ -47,6 +43,39 @@
static int blktap_read_fe_ring(void);
static int blktap_read_be_ring(void);
+/* -------[ mmap region ]--------------------------------------------- */
+/*
+ * We use a big chunk of address space to map in-flight requests into,
+ * and export this region up to user-space. See the comments in blkback
+ * about this -- the two must be kept in sync if the tap is used as a
+ * passthrough.
+ */
+
+#define MAX_PENDING_REQS 64
+
+/* immediately before the mmap area, we have a bunch of pages reserved
+ * for shared memory rings.
+ */
+#define RING_PAGES 3 /* Ctrl, Front, and Back */
+
+/* Where things are inside the device mapping. */
+struct vm_area_struct *blktap_vma = NULL;
+unsigned long mmap_vstart; /* Kernel pages for mapping in data. */
+unsigned long rings_vstart; /* start of mmaped vma */
+unsigned long user_vstart; /* start of user mappings */
+
+#define MMAP_PAGES_PER_REQUEST \
+ (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
+#define MMAP_PAGES \
+ (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
+#define MMAP_VADDR(_start, _req,_seg) \
+ ( _start + \
+ ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+ ((_seg) * PAGE_SIZE))
+
+
+
+
/* -------[ blktap vm ops ]------------------------------------------- */
static struct page *blktap_nopage(struct vm_area_struct *vma,
@@ -76,8 +105,6 @@
if ( test_and_set_bit(0, &blktap_dev_inuse) )
return -EBUSY;
-
- printk(KERN_ALERT "blktap open.\n");
/* Allocate the ctrl ring. */
csring = (ctrl_sring_t *)get_zeroed_page(GFP_KERNEL);
@@ -128,7 +155,7 @@
blktap_dev_inuse = 0;
blktap_ring_ok = 0;
- printk(KERN_ALERT "blktap closed.\n");
+ DPRINTK(KERN_ALERT "blktap closed.\n");
/* Free the ring page. */
ClearPageReserved(virt_to_page(blktap_uctrl_ring.sring));
@@ -140,7 +167,7 @@
ClearPageReserved(virt_to_page(blktap_ube_ring.sring));
free_page((unsigned long) blktap_ube_ring.sring);
- /* Clear any active mappings. */
+ /* Clear any active mappings and free foreign map table */
if (blktap_vma != NULL) {
zap_page_range(blktap_vma, blktap_vma->vm_start,
blktap_vma->vm_end - blktap_vma->vm_start, NULL);
@@ -151,21 +178,36 @@
}
/* Note on mmap:
- * remap_pfn_range sets VM_IO on vma->vm_flags. In trying to make libaio
- * work to do direct page access from userspace, this ended up being a
- * problem. The bigger issue seems to be that there is no way to map
- * a foreign page in to user space and have the virtual address of that
- * page map sanely down to a mfn.
- * Removing the VM_IO flag results in a loop in get_user_pages, as
- * pfn_valid() always fails on a foreign page.
+ * We need to map pages to user space in a way that will allow the block
+ * subsystem set up direct IO to them. This couldn't be done before, because
+ * there isn't really a sane way to make a user virtual address down to a
+ * physical address when the page belongs to another domain.
+ *
+ * My first approach was to map the page in to kernel memory, add an entry
+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
+ * and then attempt to map that page up to user space. This is disallowed
+ * by xen though, which realizes that we don't really own the machine frame
+ * underlying the physical page.
+ *
+ * The new approach is to provide explicit support for this in xen linux.
+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
+ * mapped from other vms. vma->vm_private_data is set up as a mapping
+ * from pages to actual page structs. There is a new clause in get_user_pages
+ * that does the right thing for this sort of mapping.
+ *
+ * blktap_mmap sets up this mapping. Most of the real work is done in
+ * blktap_write_fe_ring below.
*/
static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
{
int size;
-
- printk(KERN_ALERT "blktap mmap (%lx, %lx)\n",
+ struct page **map;
+ int i;
+
+ DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n",
vma->vm_start, vma->vm_end);
+ vma->vm_flags |= VM_RESERVED;
vma->vm_ops = &blktap_vm_ops;
size = vma->vm_end - vma->vm_start;
@@ -177,10 +219,10 @@
}
size >>= PAGE_SHIFT;
- printk(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1);
+ DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1);
rings_vstart = vma->vm_start;
- mmap_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT);
+ user_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT);
/* Map the ring pages to the start of the region and reserve it. */
@@ -190,29 +232,44 @@
DPRINTK("Mapping ctrl_ring page %lx.\n", __pa(blktap_uctrl_ring.sring));
if (remap_pfn_range(vma, vma->vm_start,
__pa(blktap_uctrl_ring.sring) >> PAGE_SHIFT,
- PAGE_SIZE, vma->vm_page_prot)) {
- WPRINTK("ctrl_ring: remap_pfn_range failure!\n");
- }
+ PAGE_SIZE, vma->vm_page_prot))
+ goto fail;
DPRINTK("Mapping be_ring page %lx.\n", __pa(blktap_ube_ring.sring));
if (remap_pfn_range(vma, vma->vm_start + PAGE_SIZE,
__pa(blktap_ube_ring.sring) >> PAGE_SHIFT,
- PAGE_SIZE, vma->vm_page_prot)) {
- WPRINTK("be_ring: remap_pfn_range failure!\n");
- }
+ PAGE_SIZE, vma->vm_page_prot))
+ goto fail;
DPRINTK("Mapping fe_ring page %lx.\n", __pa(blktap_ufe_ring.sring));
if (remap_pfn_range(vma, vma->vm_start + ( 2 * PAGE_SIZE ),
__pa(blktap_ufe_ring.sring) >> PAGE_SHIFT,
- PAGE_SIZE, vma->vm_page_prot)) {
- WPRINTK("fe_ring: remap_pfn_range failure!\n");
- }
-
+ PAGE_SIZE, vma->vm_page_prot))
+ goto fail;
+
+ /* Mark this VM as containing foreign pages, and set up mappings. */
+ map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
+ * sizeof(struct page_struct*),
+ GFP_KERNEL);
+ if (map == NULL) goto fail;
+
+ for (i=0; i<((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
+ map[i] = NULL;
+
+ vma->vm_private_data = map;
+ vma->vm_flags |= VM_FOREIGN;
+
blktap_vma = vma;
blktap_ring_ok = 1;
return 0;
+ fail:
+ /* Clear any active mappings. */
+ zap_page_range(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start, NULL);
+
+ return -ENOMEM;
}
static int blktap_ioctl(struct inode *inode, struct file *filp,
@@ -263,6 +320,8 @@
RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring) ||
RING_HAS_UNPUSHED_RESPONSES(&blktap_ube_ring) ) {
+ flush_tlb_all();
+
RING_PUSH_REQUESTS(&blktap_uctrl_ring);
RING_PUSH_REQUESTS(&blktap_ufe_ring);
RING_PUSH_RESPONSES(&blktap_ube_ring);
@@ -290,10 +349,35 @@
/*-----[ Data to/from user space ]----------------------------------------*/
+static void fast_flush_area(int idx, int nr_pages)
+{
+ multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ int i;
+
+ for ( i = 0; i < nr_pages; i++ )
+ {
+ MULTI_update_va_mapping(mcl+i, MMAP_VADDR(mmap_vstart, idx, i),
+ __pte(0), 0);
+ }
+
+ mcl[nr_pages-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
+ if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
+ BUG();
+}
+
+
+extern int __direct_remap_area_pages(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long size,
+ mmu_update_t *v);
+
int blktap_write_fe_ring(blkif_request_t *req)
{
blkif_request_t *target;
- int error, i;
+ int i;
+ unsigned long remap_prot;
+ multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST+1];
+ mmu_update_t mmu[BLKIF_MAX_SEGMENTS_PER_REQUEST];
/*
* This is called to pass a request from the real frontend domain's
@@ -310,26 +394,81 @@
return 0;
}
- target = RING_GET_REQUEST(&blktap_ufe_ring,
- blktap_ufe_ring.req_prod_pvt);
+ remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
+ flush_cache_all(); /* a noop on intel... */
+
+ target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt);
memcpy(target, req, sizeof(*req));
- /* Attempt to map the foreign pages directly in to the application */
+ /* Map the foreign pages directly in to the application */
for (i=0; i<target->nr_segments; i++) {
-
- error = direct_remap_area_pages(blktap_vma->vm_mm,
- MMAP_VADDR(ID_TO_IDX(req->id), i),
- target->frame_and_sects[i] & PAGE_MASK,
- PAGE_SIZE,
- blktap_vma->vm_page_prot,
- ID_TO_DOM(req->id));
- if ( error != 0 ) {
- printk(KERN_INFO "remapping attached page failed! (%d)\n", error);
- /* the request is now dropped on the floor. */
- return 0;
+ unsigned long buf;
+ unsigned long uvaddr;
+ unsigned long kvaddr;
+ unsigned long offset;
+
+ buf = target->frame_and_sects[i] & PAGE_MASK;
+ uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i);
+ kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
+
+ MULTI_update_va_mapping_otherdomain(
+ mcl+i,
+ kvaddr,
+ pfn_pte_ma(buf >> PAGE_SHIFT, __pgprot(remap_prot)),
+ 0,
+ ID_TO_DOM(req->id));
+
+ phys_to_machine_mapping[__pa(kvaddr)>>PAGE_SHIFT] =
+ FOREIGN_FRAME(buf >> PAGE_SHIFT);
+
+ __direct_remap_area_pages(blktap_vma->vm_mm,
+ uvaddr,
+ PAGE_SIZE,
+ &mmu[i]);
+ mmu[i].val = (target->frame_and_sects[i] & PAGE_MASK)
+ | pgprot_val(blktap_vma->vm_page_prot);
+
+ offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
+ ((struct page **)blktap_vma->vm_private_data)[offset] =
+ pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+ }
+
+ /* Add the mmu_update call. */
+ mcl[i].op = __HYPERVISOR_mmu_update;
+ mcl[i].args[0] = (unsigned long)mmu;
+ mcl[i].args[1] = target->nr_segments;
+ mcl[i].args[2] = 0;
+ mcl[i].args[3] = ID_TO_DOM(req->id);
+
+ BUG_ON(HYPERVISOR_multicall(mcl, target->nr_segments+1) != 0);
+
+ /* Make sure it all worked. */
+ for ( i = 0; i < target->nr_segments; i++ )
+ {
+ if ( unlikely(mcl[i].result != 0) )
+ {
+ DPRINTK("invalid buffer -- could not remap it\n");
+ fast_flush_area(ID_TO_IDX(req->id), target->nr_segments);
+ return -1;
}
}
-
+ if ( unlikely(mcl[i].result != 0) )
+ {
+ DPRINTK("direct remapping of pages to /dev/blktap failed.\n");
+ return -1;
+ }
+
+
+ /* Mark mapped pages as reserved: */
+ for ( i = 0; i < target->nr_segments; i++ )
+ {
+ unsigned long kvaddr;
+
+ kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
+ SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT));
+ }
+
+
blktap_ufe_ring.req_prod_pvt++;
return 0;
@@ -366,7 +505,7 @@
{
/* This is called to read responses from the UFE ring. */
- RING_IDX i, rp;
+ RING_IDX i, j, rp;
blkif_response_t *resp_s;
blkif_t *blkif;
active_req_t *ar;
@@ -387,8 +526,23 @@
DPRINTK("resp->fe_ring\n");
ar = lookup_active_req(ID_TO_IDX(resp_s->id));
blkif = ar->blkif;
- zap_page_range(blktap_vma, MMAP_VADDR(ID_TO_IDX(resp_s->id), 0),
+ for (j = 0; j < ar->nr_pages; j++) {
+ unsigned long vaddr;
+ struct page **map = blktap_vma->vm_private_data;
+ int offset;
+
+ vaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), j);
+ offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
+
+ ClearPageReserved(virt_to_page(vaddr));
+ map[offset] = NULL;
+ }
+
+
+ zap_page_range(blktap_vma,
+ MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), 0),
ar->nr_pages << PAGE_SHIFT, NULL);
+ fast_flush_area(ID_TO_IDX(resp_s->id), ar->nr_pages);
write_resp_to_fe_ring(blkif, resp_s);
blktap_ufe_ring.rsp_cons = i + 1;
kick_fe_domain(blkif);
@@ -464,6 +618,9 @@
{
int err;
+ if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
+ BUG();
+
err = misc_register(&blktap_miscdev);
if ( err != 0 )
{
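
For reference, the blktap_userdev.c changes above lay the device mapping out as RING_PAGES shared-ring pages at rings_vstart followed by the per-request data area at user_vstart, with a parallel kernel-side region at mmap_vstart allocated via allocate_empty_lowmem_region() at init time. The stand-alone sketch below only illustrates the MMAP_VADDR() arithmetic and the vm_private_data offset used to record foreign pages; BLKIF_MAX_SEGMENTS_PER_REQUEST is assumed to be 11 and the start address is made up:

    /* Sketch only: the macros mirror those added to blktap_userdev.c above;
     * the constants and addresses marked "assumed"/"hypothetical" are not
     * taken from the patch. */
    #include <stdio.h>

    #define PAGE_SHIFT                     12
    #define PAGE_SIZE                      (1UL << PAGE_SHIFT)
    #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11              /* assumed value */
    #define RING_PAGES                     3               /* Ctrl, Front, and Back */
    #define MAX_PENDING_REQS               64

    #define MMAP_PAGES_PER_REQUEST (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
    #define MMAP_PAGES             (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
    #define MMAP_VADDR(_start, _req, _seg)                      \
        ((_start) +                                             \
         ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) +        \
         ((_seg) * PAGE_SIZE))

    int main(void)
    {
        unsigned long rings_vstart = 0xb7a00000UL;  /* hypothetical vma->vm_start */
        unsigned long user_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);
        int req = 5, seg = 2;

        unsigned long uvaddr = MMAP_VADDR(user_vstart, req, seg);
        /* Offset into the struct page **map kept in vma->vm_private_data. */
        unsigned long offset = (uvaddr - rings_vstart) >> PAGE_SHIFT;

        printf("uvaddr=%#lx map-offset=%lu data-pages=%d\n",
               uvaddr, offset, MMAP_PAGES);
        return 0;
    }

In blktap_write_fe_ring() the page struct for the kernel-side mapping, pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT), is stored at that offset, which is how get_user_pages() resolves the foreign frames through the VM_FOREIGN map rather than through pfn_valid().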
diff -r 43f424818d6e -r 8004acaa6684 tools/blktap/blktaplib.c
--- a/tools/blktap/blktaplib.c Thu Aug 4 16:53:11 2005
+++ b/tools/blktap/blktaplib.c Thu Aug 4 16:53:30 2005
@@ -34,7 +34,7 @@
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif
-#define DEBUG_RING_IDXS 0
+#define DEBUG_RING_IDXS 1
#define POLLRDNORM 0x040
diff -r 43f424818d6e -r 8004acaa6684 xen/include/public/io/blkif.h
--- a/xen/include/public/io/blkif.h Thu Aug 4 16:53:11 2005
+++ b/xen/include/public/io/blkif.h Thu Aug 4 16:53:30 2005
@@ -47,7 +47,7 @@
unsigned long frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
} blkif_request_t;
-#define blkif_fas(_addr, _fs, _ls) ((addr)|((_fs)<<5)|(_ls))
+#define blkif_fas(_addr, _fs, _ls) ((_addr)|((_fs)<<5)|(_ls))
#define blkif_first_sect(_fas) (((_fas)>>5)&31)
#define blkif_last_sect(_fas) ((_fas)&31)
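
For reference, the blkif.h hunk above fixes blkif_fas() to use its `_addr` parameter (the old body referenced an undefined `addr` token), so the macro packs a page-aligned machine address together with first and last sector numbers in the low bits. A minimal user-space sketch of the encoding, assuming 512-byte sectors and a hypothetical page-aligned address:

    /* Sketch only: mirrors the blkif_fas()/blkif_first_sect()/blkif_last_sect()
     * macros quoted in the hunk above; the address and sector values are
     * hypothetical. */
    #include <stdio.h>

    #define blkif_fas(_addr, _fs, _ls) ((_addr)|((_fs)<<5)|(_ls))
    #define blkif_first_sect(_fas)     (((_fas)>>5)&31)
    #define blkif_last_sect(_fas)      ((_fas)&31)

    int main(void)
    {
        unsigned long maddr = 0x12345000UL;  /* page-aligned machine address (made up) */
        /* Whole 4K page: sectors 0..(PAGE_SIZE/512)-1 == 0..7, as in the vbd.c hunk. */
        unsigned long fas = blkif_fas(maddr, 0, (4096/512)-1);

        printf("fas=%#lx frame=%#lx first=%lu last=%lu\n",
               fas, fas & ~0xfffUL, blkif_first_sect(fas), blkif_last_sect(fas));
        return 0;
    }

The packing works because the buffer is page aligned, leaving the low 12 bits free: bits 0-4 hold the last sector and bits 5-9 the first sector, which blkif_first_sect() and blkif_last_sect() recover.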
diff -r 43f424818d6e -r 8004acaa6684 tools/blktap/parallax/Makefile
--- /dev/null Thu Aug 4 16:53:11 2005
+++ b/tools/blktap/parallax/Makefile Thu Aug 4 16:53:30 2005
@@ -0,0 +1,64 @@
+XEN_ROOT = ../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+PARALLAX_INSTALL_DIR = /usr/sbin
+
+INSTALL = install
+INSTALL_PROG = $(INSTALL) -m0755
+INSTALL_DIR = $(INSTALL) -d -m0755
+
+INCLUDES += -I.. -I/usr/include -I $(XEN_LIBXC)
+
+LDFLAGS = -L.. -lpthread -lz -lblktap
+
+#PLX_SRCS :=
+PLX_SRCS := vdi.c
+PLX_SRCS += radix.c
+PLX_SRCS += snaplog.c
+PLX_SRCS += blockstore.c
+PLX_SRCS += block-async.c
+PLX_SRCS += requests-async.c
+VDI_SRCS := $(PLX_SRCS)
+PLX_SRCS += parallax.c
+
+#VDI_TOOLS :=
+VDI_TOOLS := vdi_create
+VDI_TOOLS += vdi_list
+VDI_TOOLS += vdi_snap
+VDI_TOOLS += vdi_snap_list
+VDI_TOOLS += vdi_snap_delete
+VDI_TOOLS += vdi_fill
+VDI_TOOLS += vdi_tree
+VDI_TOOLS += vdi_validate
+
+CFLAGS += -Wall
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+#CFLAGS += -O3
+CFLAGS += -g3
+CFLAGS += -fno-strict-aliasing
+CFLAGS += $(INCLUDES)
+CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
+# Get gcc to generate the dependencies for us.
+CFLAGS += -Wp,-MD,.$(@F).d
+DEPS = .*.d
+
+OBJS = $(patsubst %.c,%.o,$(SRCS))
+IBINS = parallax $(VDI_TOOLS)
+
+all: $(VDI_TOOLS) parallax blockstored
+
+install: all
+ $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(PARALLAX_INSTALL_DIR)
+
+clean:
+ rm -rf *.o *~ $(DEPS) xen TAGS $(VDI_TOOLS) parallax vdi_unittest
+
+parallax: $(PLX_SRCS)
+ $(CC) $(CFLAGS) -o parallax -L.. $(LDFLAGS) $(PLX_SRCS)
+
+${VDI_TOOLS}: %: %.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o $@ $@.c $(LDFLAGS) $(VDI_SRCS)
+
+.PHONY: TAGS clean install rpm
+-include $(DEPS)
\ No newline at end of file