In this mode of operation, the receiving domain maps the sending
domain's buffers, rather than grant-copying them into local memory.
This is marginally faster, but requires the receiving domain to be
somewhat trusted, because:
a) It can see anything else which happens to be on the same page
as the transmit buffer, and
b) It can just hold onto the pages indefinitely, causing a memory leak
in the transmitting domain.
It's therefore only really suitable for talking to a trusted peer, and
that is how we use it.
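
To make the trust trade-off concrete, the receive path boils down to a
grant map instead of a grant copy.  A minimal, illustrative sketch (not
part of this patch; the real code in receiver_map.c batches these
operations) using the same grant-table helpers the driver relies on:

    #include <xen/gnttab.h>

    /* Map one granted fragment from @peer read-only at @local_va.
       Because the whole page is mapped, the receiver can see anything
       else on it, and the sender cannot reclaim the page until the
       receiver unmaps it -- hence points a) and b) above. */
    static int map_peer_fragment(void *local_va, grant_ref_t gref,
                                 domid_t peer, grant_handle_t *handle)
    {
            gnttab_map_grant_ref_t op;

            gnttab_set_map_op(&op, (unsigned long)local_va,
                              GNTMAP_host_map | GNTMAP_readonly,
                              gref, peer);
            if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
                    return -EFAULT;
            if (op.status != 0)
                    return -EIO;
            *handle = op.handle;
            return 0;
    }
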
Signed-off-by: Steven Smith <steven.smith@xxxxxxxxxx>
---
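Reviewer note (illustrative only, not part of the commit message): when
the receiving domain is done with a mapped packet, receiver_map.c unmaps
each fragment and invalidates its pseudo-physical slot before sending
the FINISH message.  Condensed to a single fragment, and ignoring the
batching that do_unmaps() performs, that path looks roughly like this:

    /* Sketch: unmap one previously mapped fragment and drop its p2m
       entry, mirroring what do_unmaps() does in batches of up to 32
       operations. */
    static void unmap_one_fragment(void *va, grant_handle_t handle)
    {
            gnttab_unmap_grant_ref_t op;

            gnttab_set_unmap_op(&op, (unsigned long)va,
                                GNTMAP_host_map, handle);
            BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
                                             &op, 1));
            set_phys_to_machine(__pa(va) >> PAGE_SHIFT,
                                INVALID_P2M_ENTRY);
    }
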
drivers/xen/netchannel2/Makefile | 3 +-
drivers/xen/netchannel2/chan.c | 14 +
drivers/xen/netchannel2/netchannel2_core.h | 17 +-
drivers/xen/netchannel2/receiver_map.c | 786 ++++++++++++++++++++++++++++
drivers/xen/netchannel2/recv_packet.c | 23 +
drivers/xen/netchannel2/rscb.c | 46 ++-
drivers/xen/netchannel2/util.c | 14 +
drivers/xen/netchannel2/xmit_packet.c | 12 +-
include/xen/interface/io/netchannel2.h | 20 +
9 files changed, 919 insertions(+), 16 deletions(-)
create mode 100644 drivers/xen/netchannel2/receiver_map.c
diff --git a/drivers/xen/netchannel2/Makefile b/drivers/xen/netchannel2/Makefile
index 565ba89..d6fb796 100644
--- a/drivers/xen/netchannel2/Makefile
+++ b/drivers/xen/netchannel2/Makefile
@@ -1,7 +1,8 @@
obj-$(CONFIG_XEN_NETCHANNEL2) += netchannel2.o
netchannel2-objs := chan.o netchan2.o rscb.o util.o \
- xmit_packet.o offload.o recv_packet.o poll.o
+ xmit_packet.o offload.o recv_packet.o poll.o \
+ receiver_map.o
ifeq ($(CONFIG_XEN_NETDEV2_BACKEND),y)
netchannel2-objs += netback2.o
diff --git a/drivers/xen/netchannel2/chan.c b/drivers/xen/netchannel2/chan.c
index 9bb7ce7..47e1c5e 100644
--- a/drivers/xen/netchannel2/chan.c
+++ b/drivers/xen/netchannel2/chan.c
@@ -395,6 +395,13 @@ struct netchannel2 *nc2_new(struct xenbus_device *xd)
return NULL;
}
+ if (local_trusted) {
+ if (init_receive_map_mode() < 0) {
+ nc2_release(nc);
+ return NULL;
+ }
+ }
+
netdev->open = nc2_open;
netdev->stop = nc2_stop;
netdev->hard_start_xmit = nc2_start_xmit;
@@ -499,6 +506,8 @@ int nc2_attach_rings(struct netchannel2 *nc,
spin_unlock_bh(&nc->rings.lock);
+ resume_receive_map_mode();
+
netif_carrier_on(nc->net_device);
/* Kick it to get it going. */
@@ -630,6 +639,11 @@ int nc2_get_evtchn_port(struct netchannel2 *nc)
return nc->rings.evtchn;
}
+void nc2_suspend(struct netchannel2 *nc)
+{
+ suspend_receive_map_mode();
+}
+
/* @ncrp has been recently nc2_kick()ed. Do all of the necessary
stuff. */
static int process_ring(struct napi_struct *napi,
diff --git a/drivers/xen/netchannel2/netchannel2_core.h b/drivers/xen/netchannel2/netchannel2_core.h
index 7be97ea..c4de063 100644
--- a/drivers/xen/netchannel2/netchannel2_core.h
+++ b/drivers/xen/netchannel2/netchannel2_core.h
@@ -37,6 +37,7 @@ enum transmit_policy {
transmit_policy_unknown = 0,
transmit_policy_first = 0xf001,
transmit_policy_grant = transmit_policy_first,
+ transmit_policy_map,
transmit_policy_small,
transmit_policy_last = transmit_policy_small
};
@@ -320,6 +321,11 @@ struct sk_buff *handle_receiver_copy_packet(struct netchannel2 *nc,
struct netchannel2_msg_hdr *hdr,
unsigned nr_frags,
unsigned frags_off);
+struct sk_buff *handle_receiver_map_packet(struct netchannel2 *nc,
+ struct netchannel2_msg_packet *msg,
+ struct netchannel2_msg_hdr *hdr,
+ unsigned nr_frags,
+ unsigned frags_off);
enum prepare_xmit_result {
PREP_XMIT_OKAY = 0,
@@ -332,9 +338,11 @@ enum prepare_xmit_result prepare_xmit_allocate_small(
struct sk_buff *skb);
enum prepare_xmit_result prepare_xmit_allocate_grant(
struct netchannel2_ring_pair *ncrp,
- struct sk_buff *skb);
+ struct sk_buff *skb,
+ int use_subpage_grants);
void xmit_grant(struct netchannel2_ring_pair *ncrp,
struct sk_buff *skb,
+ int use_subpage_grants,
volatile void *msg);
void queue_finish_packet_message(struct netchannel2_ring_pair *ncrp,
@@ -353,6 +361,8 @@ void fetch_fragment(struct netchannel2_ring_pair *ncrp,
struct netchannel2_fragment *frag,
unsigned off);
+void pull_through(struct sk_buff *skb, unsigned count);
+
void nc2_kick(struct netchannel2_ring_pair *ncrp);
int nc2_map_grants(struct grant_mapping *gm,
@@ -366,6 +376,11 @@ void queue_packet_to_interface(struct sk_buff *skb,
void nc2_rscb_on_gntcopy_fail(void *ctxt, gnttab_copy_t *gop);
+int init_receive_map_mode(void);
+void deinit_receive_map_mode(void);
+void suspend_receive_map_mode(void);
+void resume_receive_map_mode(void);
+
int nc2_start_xmit(struct sk_buff *skb, struct net_device *dev);
int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp,
struct sk_buff *skb);
diff --git a/drivers/xen/netchannel2/receiver_map.c b/drivers/xen/netchannel2/receiver_map.c
new file mode 100644
index 0000000..e5c4ed1
--- /dev/null
+++ b/drivers/xen/netchannel2/receiver_map.c
@@ -0,0 +1,786 @@
+/* Support for mapping packets into the local domain, rather than
+ copying them or using pre-posted buffers. We only implement
+ receive-side support here; for transmit-side, we use the rscb.c
+ implementation. */
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <xen/live_maps.h>
+#include <xen/gnttab.h>
+#include <xen/balloon.h>
+#include <xen/evtchn.h>
+#include "netchannel2_core.h"
+
+#define MAX_MAPPED_FRAGS 1024
+#define MAX_MAPPED_PACKETS MAX_PENDING_FINISH_PACKETS
+#define SKB_MIN_PAYLOAD_SIZE 128
+
+static DEFINE_SPINLOCK(global_map_lock);
+static struct receive_mapper *receive_mapper;
+
+/* How long do we leave the packets in the Linux stack before trying
+ to copy them, in jiffies? */
+#define PACKET_TIMEOUT (HZ/2)
+
+/* A slot into which we could map a fragment. */
+struct rx_map_fragment {
+ struct list_head list;
+ struct rx_map_packet *packet;
+ grant_handle_t handle; /* 0 if the fragment isn't currently
+ * mapped */
+ struct netchannel2_fragment nc_frag;
+};
+
+struct rx_map_packet {
+ struct list_head list;
+ struct list_head frags;
+ /* We take a reference for every mapped fragment associated
+ with the packet. When the refcnt goes to zero, the packet
+ is finished, and can be moved to the
+ finished_packets_list. */
+ atomic_t refcnt;
+ unsigned id;
+ unsigned long expires; /* We expect Linux to have finished
+ with the packet by this time (in
+ jiffies), or we try to copy it. */
+ struct netchannel2 *nc;
+ uint8_t flags;
+};
+
+struct receive_mapper {
+ struct page_foreign_tracker *tracker;
+
+ struct page **pages;
+
+ /* Nests inside the netchannel2 lock. The
+ finished_packets_lock nests inside this. */
+ spinlock_t rm_lock;
+
+ /* Packet fragments which we've mapped, or slots into which we
+ could map packets. The free list and count are protected
+ by @rm_lock. */
+ struct rx_map_fragment frags[MAX_MAPPED_FRAGS];
+ struct list_head free_frags;
+
+ struct rx_map_packet packets[MAX_MAPPED_PACKETS];
+ struct list_head free_packets;
+ struct list_head active_packets;
+ unsigned nr_free_packets;
+
+ /* Packets which Linux has finished with but which we haven't
+ returned to the other endpoint yet. */
+ spinlock_t finished_packets_lock; /* BH-safe leaf lock,
+ * acquired from the page
+ * free callback. Nests
+ * inside the rm_lock. */
+ struct list_head finished_packets;
+
+ struct tasklet_struct gc_tasklet;
+
+ struct timer_list expire_timer;
+
+ /* Set if we're trying to run the mapper down prior to
+ suspending the domain. */
+ uint8_t suspending;
+};
+
+static void suspend_receive_mapper(struct receive_mapper *rm);
+
+static unsigned fragment_idx(const struct rx_map_fragment *frag)
+{
+ return frag - receive_mapper->frags;
+}
+
+static int alloc_rx_frags_for_packet(unsigned nr_frags,
+ struct rx_map_packet *packet)
+{
+ struct rx_map_fragment *rmf;
+ unsigned x;
+
+ INIT_LIST_HEAD(&packet->frags);
+ for (x = 0; x < nr_frags; x++) {
+ if (list_empty(&receive_mapper->free_frags))
+ goto err;
+ rmf = list_entry(receive_mapper->free_frags.next,
+ struct rx_map_fragment,
+ list);
+ rmf->packet = packet;
+ rmf->handle = -1;
+ list_move(&rmf->list, &packet->frags);
+ }
+ return 0;
+
+err:
+ list_splice_init(&packet->frags, &receive_mapper->free_frags);
+ return -EBUSY;
+}
+
+static struct rx_map_packet *alloc_rx_packet(struct netchannel2 *nc,
+ unsigned nr_frags)
+{
+ struct rx_map_packet *rmp;
+
+ spin_lock(&receive_mapper->rm_lock);
+ if (list_empty(&receive_mapper->free_packets) ||
+ receive_mapper->suspending) {
+ spin_unlock(&receive_mapper->rm_lock);
+ return NULL;
+ }
+ rmp = list_entry(receive_mapper->free_packets.next,
+ struct rx_map_packet, list);
+
+ if (alloc_rx_frags_for_packet(nr_frags, rmp) < 0) {
+ spin_unlock(&receive_mapper->rm_lock);
+ return NULL;
+ }
+ list_del(&rmp->list);
+ atomic_set(&rmp->refcnt, nr_frags);
+ rmp->nc = nc;
+ receive_mapper->nr_free_packets--;
+
+ spin_unlock(&receive_mapper->rm_lock);
+
+ return rmp;
+}
+
+struct grant_unmapper {
+ unsigned nr_gops;
+ gnttab_unmap_grant_ref_t gop_queue[32];
+};
+
+static void do_unmaps(struct grant_unmapper *unmapper)
+{
+ int ret;
+ unsigned x;
+
+ if (unmapper->nr_gops != 0) {
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ unmapper->gop_queue,
+ unmapper->nr_gops);
+ BUG_ON(ret);
+ for (x = 0; x < unmapper->nr_gops; x++) {
+ set_phys_to_machine(
+ __pa(unmapper->gop_queue[x].host_addr) >>
+ PAGE_SHIFT,
+ INVALID_P2M_ENTRY);
+ }
+ }
+ unmapper->nr_gops = 0;
+}
+
+static void grant_unmap(struct grant_unmapper *unmapper,
+ void *va,
+ int handle)
+{
+ gnttab_unmap_grant_ref_t *gop;
+ if (unmapper->nr_gops == ARRAY_SIZE(unmapper->gop_queue))
+ do_unmaps(unmapper);
+ gop = &unmapper->gop_queue[unmapper->nr_gops];
+ gnttab_set_unmap_op(gop, (unsigned long)va, GNTMAP_host_map, handle);
+ unmapper->nr_gops++;
+}
+
+/* A tasklet which is invoked shortly after a packet is released so
+ that we can send the FINISH_PACKET message. */
+static void gc_tasklet(unsigned long _rm)
+{
+ struct list_head packets;
+ struct rx_map_packet *packet;
+ struct rx_map_fragment *rx_frag;
+ struct list_head released_fragments;
+ unsigned nr_released_packets;
+ unsigned idx;
+ struct grant_unmapper unmapper;
+ struct page *page;
+ struct netchannel2 *locked_nc;
+
+ INIT_LIST_HEAD(&packets);
+
+ spin_lock(&receive_mapper->finished_packets_lock);
+ list_splice_init(&receive_mapper->finished_packets, &packets);
+ spin_unlock(&receive_mapper->finished_packets_lock);
+
+ /* Unmap the fragments. */
+ unmapper.nr_gops = 0;
+ BUG_ON(packets.next == NULL);
+ list_for_each_entry(packet, &packets, list) {
+ BUG_ON(packet->list.next == NULL);
+ BUG_ON(atomic_read(&packet->refcnt) != 0);
+ BUG_ON(packet->frags.next == NULL);
+ list_for_each_entry(rx_frag, &packet->frags, list) {
+ BUG_ON(rx_frag->list.next == NULL);
+ if (rx_frag->handle == -1)
+ continue;
+ idx = fragment_idx(rx_frag);
+ page = receive_mapper->pages[idx];
+ stop_tracking_page(page);
+ grant_unmap(&unmapper, page_address(page),
+ rx_frag->handle);
+ }
+ }
+ do_unmaps(&unmapper);
+
+ /* Tell the other end that the packets are finished, and
+ accumulate the fragments into a local free list. */
+ INIT_LIST_HEAD(&released_fragments);
+ nr_released_packets = 0;
+
+ locked_nc = NULL;
+ list_for_each_entry(packet, &packets, list) {
+ if (locked_nc != packet->nc) {
+ if (locked_nc) {
+ spin_unlock(&locked_nc->rings.lock);
+ nc2_kick(&locked_nc->rings);
+ }
+ spin_lock(&packet->nc->rings.lock);
+ locked_nc = packet->nc;
+ }
+ BUG_ON(packet->frags.next == NULL);
+ list_for_each_entry(rx_frag, &packet->frags, list) {
+ BUG_ON(rx_frag->list.next == NULL);
+ idx = fragment_idx(rx_frag);
+ gnttab_reset_grant_page(receive_mapper->pages[idx]);
+ }
+ nr_released_packets++;
+ list_splice_init(&packet->frags, &released_fragments);
+ queue_finish_packet_message(&locked_nc->rings, packet->id,
+ packet->flags);
+ }
+
+ if (locked_nc) {
+ spin_unlock(&locked_nc->rings.lock);
+ nc2_kick(&locked_nc->rings);
+ locked_nc = NULL;
+
+ spin_lock(&receive_mapper->rm_lock);
+ list_splice(&packets, &receive_mapper->free_packets);
+ list_splice(&released_fragments, &receive_mapper->free_frags);
+ receive_mapper->nr_free_packets += nr_released_packets;
+
+ /* Reprogram the expire timer. */
+ if (!list_empty(&receive_mapper->active_packets)) {
+ mod_timer(&receive_mapper->expire_timer,
+ list_entry(receive_mapper->active_packets.next,
+ struct rx_map_packet,
+ list)->expires);
+ }
+ spin_unlock(&receive_mapper->rm_lock);
+ }
+}
+
+/* Decrement the refcnt on @rmp and, if necessary, move it to the
+ finished packets list and schedule the GC tasklet. */
+static void put_rx_map_packet(struct rx_map_packet *rmp)
+{
+ if (atomic_dec_and_test(&rmp->refcnt)) {
+ /* Remove it from the active list. */
+ spin_lock_bh(&receive_mapper->rm_lock);
+ list_del(&rmp->list);
+ spin_unlock_bh(&receive_mapper->rm_lock);
+
+ /* Add it to the finished list. */
+ spin_lock_bh(&receive_mapper->finished_packets_lock);
+ list_add_tail(&rmp->list, &receive_mapper->finished_packets);
+ spin_unlock_bh(&receive_mapper->finished_packets_lock);
+
+ tasklet_schedule(&receive_mapper->gc_tasklet);
+ }
+}
+
+
+/* The page @page, which was previously part of a receiver-mapped SKB,
+ * has been released. If it was the last page involved in its SKB,
+ * the packet is finished and we can tell the other end that it's
+ * finished.
+ */
+static void netchan2_page_release(struct page *page, unsigned order)
+{
+ struct rx_map_fragment *frag;
+ struct rx_map_packet *rmp;
+
+ BUG_ON(order != 0);
+
+ frag = (struct rx_map_fragment *)page->mapping;
+ rmp = frag->packet;
+
+ put_rx_map_packet(rmp);
+}
+
+/* Unmap the packet, removing all other references to it. The caller
+ * should take an additional reference to the packet before calling
+ * this, to stop it disappearing underneath us. The only way of
+ * checking whether this succeeded is to look at the packet's
+ * reference count after it returns.
+ */
+static void unmap_this_packet(struct rx_map_packet *rmp)
+{
+ struct rx_map_fragment *rx_frag;
+ unsigned idx;
+ int r;
+ int cnt;
+
+ /* Unmap every fragment in the packet. We don't fail the whole
+ function just because gnttab_copy_grant_page() failed,
+ because success or failure will be inferable from the
+ reference count on the packet (this makes it easier to
+ handle the case where some pages have already been copied,
+ for instance). */
+ cnt = 0;
+ list_for_each_entry(rx_frag, &rmp->frags, list) {
+ idx = fragment_idx(rx_frag);
+ if (rx_frag->handle != -1) {
+ r = gnttab_copy_grant_page(rx_frag->handle,
+ &receive_mapper->pages[idx]);
+ if (r == 0) {
+ /* We copied the page, so it's not really
+ mapped any more. */
+ rx_frag->handle = -1;
+ atomic_dec(&rmp->refcnt);
+ }
+ }
+ cnt++;
+ }
+
+ /* Caller should hold a reference. */
+ BUG_ON(atomic_read(&rmp->refcnt) == 0);
+}
+
+static void unmap_all_packets(void)
+{
+ struct rx_map_packet *rmp;
+ struct rx_map_packet *next;
+ struct list_head finished_packets;
+ int need_tasklet;
+
+ INIT_LIST_HEAD(&finished_packets);
+
+ spin_lock_bh(&receive_mapper->rm_lock);
+
+ list_for_each_entry_safe(rmp, next, &receive_mapper->active_packets,
+ list) {
+ atomic_inc(&rmp->refcnt);
+ unmap_this_packet(rmp);
+ if (atomic_dec_and_test(&rmp->refcnt))
+ list_move(&rmp->list, finished_packets.prev);
+ }
+ spin_unlock_bh(&receive_mapper->rm_lock);
+
+ need_tasklet = !list_empty(&finished_packets);
+
+ spin_lock_bh(&receive_mapper->finished_packets_lock);
+ list_splice(&finished_packets, receive_mapper->finished_packets.prev);
+ spin_unlock_bh(&receive_mapper->finished_packets_lock);
+
+ if (need_tasklet)
+ tasklet_schedule(&receive_mapper->gc_tasklet);
+}
+
+static void free_receive_mapper(struct receive_mapper *rm)
+{
+ unsigned x;
+
+ /* Get rid of any packets which are currently mapped. */
+ suspend_receive_mapper(rm);
+
+ /* Stop the expiry timer. We know it won't get requeued
+ * because there are no packets outstanding and rm->suspending
+ * is set (because of suspend_receive_mapper()). */
+ del_timer_sync(&rm->expire_timer);
+
+ /* Wait for any last instances of the tasklet to finish. */
+ tasklet_kill(&rm->gc_tasklet);
+
+ if (rm->pages != NULL) {
+ for (x = 0; x < MAX_MAPPED_FRAGS; x++) {
+ if (PageForeign(rm->pages[x]))
+ ClearPageForeign(rm->pages[x]);
+ rm->pages[x]->mapping = NULL;
+ }
+ free_empty_pages_and_pagevec(rm->pages, MAX_MAPPED_FRAGS);
+ }
+ if (rm->tracker != NULL)
+ free_page_foreign_tracker(rm->tracker);
+ kfree(rm);
+}
+
+/* Timer invoked shortly after a packet expires, so that we can copy
+ the data and get it back from Linux. This is necessary if a packet
+ gets stuck in a socket RX queue somewhere, or you risk a
+ deadlock. */
+static void expire_timer(unsigned long data)
+{
+ struct rx_map_packet *rmp, *next;
+ struct list_head finished_packets;
+ int need_tasklet;
+
+ INIT_LIST_HEAD(&finished_packets);
+
+ spin_lock(&receive_mapper->rm_lock);
+ list_for_each_entry_safe(rmp, next, &receive_mapper->active_packets,
+ list) {
+ if (time_after(rmp->expires, jiffies)) {
+ mod_timer(&receive_mapper->expire_timer, rmp->expires);
+ break;
+ }
+ atomic_inc(&rmp->refcnt);
+ unmap_this_packet(rmp);
+ if (atomic_dec_and_test(&rmp->refcnt)) {
+ list_move(&rmp->list, finished_packets.prev);
+ } else {
+ /* Couldn't unmap the packet, either because
+ it's in use by real hardware or we've run
+ out of memory. Send the packet to the end
+ of the queue and update the expiry time so
+ that we try again later. */
+ /* Note that this can make the active packet
+ list slightly out of order. Oh well; it
+ won't be by more than a few jiffies, and it
+ doesn't really matter that much. */
+ rmp->expires = jiffies + PACKET_TIMEOUT;
+ list_move(&rmp->list,
+ receive_mapper->active_packets.prev);
+ }
+ }
+ spin_unlock(&receive_mapper->rm_lock);
+
+ need_tasklet = !list_empty(&finished_packets);
+
+ spin_lock(&receive_mapper->finished_packets_lock);
+ list_splice(&finished_packets, receive_mapper->finished_packets.prev);
+ spin_unlock(&receive_mapper->finished_packets_lock);
+
+ if (need_tasklet)
+ tasklet_schedule(&receive_mapper->gc_tasklet);
+}
+
+static struct receive_mapper *new_receive_mapper(void)
+{
+ struct receive_mapper *rm;
+ unsigned x;
+
+ rm = kzalloc(sizeof(*rm), GFP_KERNEL);
+ if (!rm)
+ goto err;
+ INIT_LIST_HEAD(&rm->free_frags);
+ INIT_LIST_HEAD(&rm->free_packets);
+ INIT_LIST_HEAD(&rm->active_packets);
+ INIT_LIST_HEAD(&rm->finished_packets);
+ spin_lock_init(&rm->rm_lock);
+ spin_lock_init(&rm->finished_packets_lock);
+ for (x = 0; x < MAX_MAPPED_FRAGS; x++)
+ list_add_tail(&rm->frags[x].list, &rm->free_frags);
+ for (x = 0; x < MAX_MAPPED_PACKETS; x++)
+ list_add_tail(&rm->packets[x].list, &rm->free_packets);
+ rm->nr_free_packets = MAX_MAPPED_PACKETS;
+
+ setup_timer(&rm->expire_timer, expire_timer, 0);
+ tasklet_init(&rm->gc_tasklet, gc_tasklet, 0);
+
+ rm->tracker = alloc_page_foreign_tracker(MAX_MAPPED_FRAGS);
+ if (!rm->tracker)
+ goto err;
+ rm->pages = alloc_empty_pages_and_pagevec(MAX_MAPPED_FRAGS);
+ if (!rm->pages)
+ goto err;
+ for (x = 0; x < MAX_MAPPED_FRAGS; x++) {
+ SetPageForeign(rm->pages[x], netchan2_page_release);
+ rm->pages[x]->mapping = (void *)&rm->frags[x];
+ }
+
+ return rm;
+
+err:
+ if (rm != NULL)
+ free_receive_mapper(rm);
+ return NULL;
+}
+
+static void attach_frag_to_skb(struct sk_buff *skb,
+ struct rx_map_fragment *frag)
+{
+ unsigned idx;
+ struct skb_shared_info *shinfo;
+ skb_frag_t *sk_frag;
+
+ shinfo = skb_shinfo(skb);
+ sk_frag = &shinfo->frags[shinfo->nr_frags];
+ idx = fragment_idx(frag);
+ sk_frag->page = receive_mapper->pages[idx];
+ sk_frag->page_offset = frag->nc_frag.off;
+ sk_frag->size = frag->nc_frag.size;
+ shinfo->nr_frags++;
+}
+
+struct rx_plan {
+ int is_failed;
+ unsigned nr_mops;
+ gnttab_map_grant_ref_t mops[8];
+ struct rx_map_fragment *frags[8];
+};
+
+static void flush_grant_operations(struct rx_plan *rp)
+{
+ unsigned x;
+ int ret;
+ gnttab_map_grant_ref_t *mop;
+
+ if (rp->nr_mops == 0)
+ return;
+ if (!rp->is_failed) {
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ rp->mops,
+ rp->nr_mops);
+ BUG_ON(ret);
+ for (x = 0; x < rp->nr_mops; x++) {
+ mop = &rp->mops[x];
+ if (mop->status != 0) {
+ rp->is_failed = 1;
+ } else {
+ rp->frags[x]->handle = mop->handle;
+ set_phys_to_machine(
+ __pa(mop->host_addr) >> PAGE_SHIFT,
+ FOREIGN_FRAME(mop->dev_bus_addr >>
+ PAGE_SHIFT));
+ }
+ }
+ }
+ rp->nr_mops = 0;
+}
+
+static void map_fragment(struct rx_plan *rp,
+ struct rx_map_fragment *rx_frag,
+ struct netchannel2 *nc)
+{
+ unsigned idx = fragment_idx(rx_frag);
+ gnttab_map_grant_ref_t *mop;
+
+ if (rp->nr_mops == ARRAY_SIZE(rp->mops))
+ flush_grant_operations(rp);
+ mop = &rp->mops[rp->nr_mops];
+ gnttab_set_map_op(mop,
+ (unsigned long)page_address(receive_mapper->pages[idx]),
+ GNTMAP_host_map | GNTMAP_readonly,
+ rx_frag->nc_frag.receiver_map.gref,
+ nc->rings.otherend_id);
+ rp->frags[rp->nr_mops] = rx_frag;
+ rp->nr_mops++;
+}
+
+/* Unmap a packet which has been half-mapped. */
+static void unmap_partial_packet(struct rx_map_packet *rmp)
+{
+ unsigned idx;
+ struct rx_map_fragment *rx_frag;
+ struct grant_unmapper unmapper;
+
+ unmapper.nr_gops = 0;
+ list_for_each_entry(rx_frag, &rmp->frags, list) {
+ if (rx_frag->handle == -1)
+ continue;
+ idx = fragment_idx(rx_frag);
+ grant_unmap(&unmapper,
+ page_address(receive_mapper->pages[idx]),
+ rx_frag->handle);
+ }
+ do_unmaps(&unmapper);
+}
+
+struct sk_buff *handle_receiver_map_packet(struct netchannel2 *nc,
+ struct netchannel2_msg_packet *msg,
+ struct netchannel2_msg_hdr *hdr,
+ unsigned nr_frags,
+ unsigned frags_off)
+{
+ struct sk_buff *skb;
+ struct rx_map_fragment *rx_frag;
+ unsigned x;
+ unsigned len;
+ struct rx_map_packet *rmp;
+ unsigned idx;
+ struct rx_plan plan;
+ unsigned prefix_size;
+
+ memset(&plan, 0, sizeof(plan));
+
+ rmp = alloc_rx_packet(nc, nr_frags);
+ if (rmp == NULL)
+ return NULL;
+
+ if (msg->prefix_size < SKB_MIN_PAYLOAD_SIZE)
+ prefix_size = SKB_MIN_PAYLOAD_SIZE;
+ else
+ prefix_size = msg->prefix_size;
+ /* As in posted_buffers.c, we don't limit the total size of
+ the packet, because we don't need to allocate more memory
+ for very large packets. The prefix is safe because it's
+ only a 16 bit number. A 64k allocation won't always
+ succeed, but it's unlikely to trigger the OOM killer or
+ otherwise interfere with the normal operation of the local
+ domain. */
+ skb = dev_alloc_skb(prefix_size + NET_IP_ALIGN);
+ if (skb == NULL) {
+ spin_lock(&receive_mapper->rm_lock);
+ list_splice(&rmp->frags, &receive_mapper->free_frags);
+ list_add(&rmp->list, &receive_mapper->free_packets);
+ receive_mapper->nr_free_packets++;
+ spin_unlock(&receive_mapper->rm_lock);
+ return NULL;
+ }
+ skb_reserve(skb, NET_IP_ALIGN);
+
+ rmp->id = msg->id;
+ rmp->flags = msg->flags;
+
+ rx_frag = list_entry(rmp->frags.next, struct rx_map_fragment, list);
+ for (x = 0; x < nr_frags; x++) {
+ fetch_fragment(&nc->rings, x, &rx_frag->nc_frag, frags_off);
+ if (rx_frag->nc_frag.size > PAGE_SIZE ||
+ rx_frag->nc_frag.off >= PAGE_SIZE ||
+ rx_frag->nc_frag.size + rx_frag->nc_frag.off > PAGE_SIZE) {
+ plan.is_failed = 1;
+ break;
+ }
+ map_fragment(&plan, rx_frag, nc);
+ rx_frag = list_entry(rx_frag->list.next,
+ struct rx_map_fragment,
+ list);
+ }
+
+ flush_grant_operations(&plan);
+ if (plan.is_failed)
+ goto fail_and_unmap;
+
+ /* Grab the prefix off of the ring. */
+ nc2_copy_from_ring_off(&nc->rings.cons_ring,
+ skb_put(skb, msg->prefix_size),
+ msg->prefix_size,
+ frags_off +
+ nr_frags * sizeof(struct netchannel2_fragment));
+
+ /* All fragments mapped, so we know that this is going to
+ work. Transfer the receive slots into the SKB. */
+ len = 0;
+ list_for_each_entry(rx_frag, &rmp->frags, list) {
+ attach_frag_to_skb(skb, rx_frag);
+ idx = fragment_idx(rx_frag);
+ start_tracking_page(receive_mapper->tracker,
+ receive_mapper->pages[idx],
+ nc->rings.otherend_id,
+ rx_frag->nc_frag.receiver_map.gref,
+ idx,
+ nc);
+ len += rx_frag->nc_frag.size;
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ skb->truesize += len;
+
+ spin_lock(&receive_mapper->rm_lock);
+ list_add_tail(&rmp->list, &receive_mapper->active_packets);
+ rmp->expires = jiffies + PACKET_TIMEOUT;
+ if (rmp == list_entry(receive_mapper->active_packets.next,
+ struct rx_map_packet,
+ list))
+ mod_timer(&receive_mapper->expire_timer, rmp->expires);
+ spin_unlock(&receive_mapper->rm_lock);
+
+ if (skb_headlen(skb) < SKB_MIN_PAYLOAD_SIZE)
+ pull_through(skb,
+ SKB_MIN_PAYLOAD_SIZE - skb_headlen(skb));
+
+ return skb;
+
+fail_and_unmap:
+ pr_debug("Failed to map received packet!\n");
+ unmap_partial_packet(rmp);
+
+ spin_lock(&receive_mapper->rm_lock);
+ list_splice(&rmp->frags, &receive_mapper->free_frags);
+ list_add_tail(&rmp->list, &receive_mapper->free_packets);
+ receive_mapper->nr_free_packets++;
+ spin_unlock(&receive_mapper->rm_lock);
+
+ kfree_skb(skb);
+ return NULL;
+}
+
+static void suspend_receive_mapper(struct receive_mapper *rm)
+{
+ spin_lock_bh(&rm->rm_lock);
+ /* Stop any more packets coming in. */
+ rm->suspending = 1;
+
+ /* Wait for Linux to give back all of the SKBs which we've
+ given it. */
+ while (rm->nr_free_packets != MAX_MAPPED_PACKETS) {
+ spin_unlock_bh(&rm->rm_lock);
+ unmap_all_packets();
+ msleep(100);
+ spin_lock_bh(&rm->rm_lock);
+ }
+ spin_unlock_bh(&rm->rm_lock);
+}
+
+static void resume_receive_mapper(void)
+{
+ spin_lock_bh(&receive_mapper->rm_lock);
+ receive_mapper->suspending = 0;
+ spin_unlock_bh(&receive_mapper->rm_lock);
+}
+
+
+int init_receive_map_mode(void)
+{
+ struct receive_mapper *new_rm;
+ spin_lock(&global_map_lock);
+ while (receive_mapper == NULL) {
+ spin_unlock(&global_map_lock);
+ new_rm = new_receive_mapper();
+ if (new_rm == NULL)
+ return -ENOMEM;
+ spin_lock(&global_map_lock);
+ if (receive_mapper == NULL) {
+ receive_mapper = new_rm;
+ } else {
+ spin_unlock(&global_map_lock);
+ free_receive_mapper(new_rm);
+ spin_lock(&global_map_lock);
+ }
+ }
+ spin_unlock(&global_map_lock);
+ return 0;
+}
+
+void deinit_receive_map_mode(void)
+{
+ if (!receive_mapper)
+ return;
+ BUG_ON(spin_is_locked(&global_map_lock));
+ free_receive_mapper(receive_mapper);
+ receive_mapper = NULL;
+}
+
+void suspend_receive_map_mode(void)
+{
+ if (!receive_mapper)
+ return;
+ suspend_receive_mapper(receive_mapper);
+}
+
+void resume_receive_map_mode(void)
+{
+ if (!receive_mapper)
+ return;
+ resume_receive_mapper();
+}
+
+struct netchannel2 *nc2_get_interface_for_page(struct page *p)
+{
+ BUG_ON(!page_is_tracked(p));
+ if (!receive_mapper ||
+ tracker_for_page(p) != receive_mapper->tracker)
+ return NULL;
+ return get_page_tracker_ctxt(p);
+}
diff --git a/drivers/xen/netchannel2/recv_packet.c b/drivers/xen/netchannel2/recv_packet.c
index 80c5d5d..8c38788 100644
--- a/drivers/xen/netchannel2/recv_packet.c
+++ b/drivers/xen/netchannel2/recv_packet.c
@@ -112,6 +112,28 @@ void nc2_handle_packet_msg(struct netchannel2 *nc,
nr_frags, frags_off);
queue_finish_packet_message(ncrp, msg.id, msg.flags);
break;
+ case NC2_PACKET_TYPE_receiver_map:
+ if (!nc->local_trusted) {
+ /* The remote doesn't trust us, so they
+ shouldn't be sending us receiver-map
+ packets. Just treat it as an RSCB
+ packet. */
+ skb = NULL;
+ } else {
+ skb = handle_receiver_map_packet(nc, &msg, hdr,
+ nr_frags,
+ frags_off);
+ /* Finish message will be sent when we unmap
+ * the packet. */
+ }
+ if (skb == NULL) {
+ /* We can't currently map this skb. Use a
+ receiver copy instead. */
+ skb = handle_receiver_copy_packet(nc, ncrp, &msg, hdr,
+ nr_frags, frags_off);
+ queue_finish_packet_message(ncrp, msg.id, msg.flags);
+ }
+ break;
default:
pr_debug("Unknown packet type %d\n", msg.type);
nc->stats.rx_errors++;
@@ -285,4 +307,5 @@ int __init nc2_init(void)
void __exit nc2_exit(void)
{
+ deinit_receive_map_mode();
}
diff --git a/drivers/xen/netchannel2/rscb.c b/drivers/xen/netchannel2/rscb.c
index 8ad5454..cdcb116 100644
--- a/drivers/xen/netchannel2/rscb.c
+++ b/drivers/xen/netchannel2/rscb.c
@@ -209,6 +209,7 @@ struct sk_buff *handle_receiver_copy_packet(struct netchannel2 *nc,
struct grant_packet_plan {
volatile struct netchannel2_fragment *out_fragment;
grant_ref_t gref_pool;
+ int use_subpage_grants;
unsigned prefix_avail;
};
@@ -223,14 +224,15 @@ static inline int nfrags_skb(struct sk_buff *skb, int prefix_size)
start_grant = ((unsigned long)skb->data + prefix_size) &
~(PAGE_SIZE-1);
end_grant = ((unsigned long)skb->data +
- skb_headlen(skb) + PAGE_SIZE - 1) &
+ skb_headlen(skb) + PAGE_SIZE - 1) &
~(PAGE_SIZE-1);
return ((end_grant - start_grant) >> PAGE_SHIFT)
+ skb_shinfo(skb)->nr_frags;
}
enum prepare_xmit_result prepare_xmit_allocate_grant(struct netchannel2_ring_pair *ncrp,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ int use_subpage_grants)
{
struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
unsigned nr_fragments;
@@ -241,13 +243,23 @@ enum prepare_xmit_result prepare_xmit_allocate_grant(struct netchannel2_ring_pai
if (allocate_txp_slot(ncrp, skb) < 0)
return PREP_XMIT_BUSY;
- /* We're going to have to get the remote to issue a grant copy
- hypercall anyway, so there's no real benefit to shoving the
- headers inline. */
- /* (very small packets won't go through here, so there's no
- chance that we could completely eliminate the grant
- copy.) */
- inline_prefix_size = sizeof(struct ethhdr);
+ if (use_subpage_grants) {
+ /* We're going to have to get the remote to issue a
+ grant copy hypercall anyway, so there's no real
+ benefit to shoving the headers inline. */
+ /* (very small packets won't go through here, so
+ there's no chance that we could completely
+ eliminate the grant copy.) */
+ inline_prefix_size = sizeof(struct ethhdr);
+ } else {
+ /* If we're going off-box (and we probably are, if the
+ remote is trusted), putting the header in the ring
+ potentially saves a TLB miss in the bridge, which
+ is worth doing. */
+ inline_prefix_size = PACKET_PREFIX_SIZE;
+ if (skb_headlen(skb) < inline_prefix_size)
+ inline_prefix_size = skb_headlen(skb);
+ }
if (skb_co->nr_fragments == 0) {
nr_fragments = nfrags_skb(skb, inline_prefix_size);
@@ -277,10 +289,14 @@ enum prepare_xmit_result prepare_xmit_allocate_grant(struct netchannel2_ring_pai
have to recompute it next time around. */
return PREP_XMIT_BUSY;
}
+
skb_co->gref_pool = gref_pool;
skb_co->inline_prefix_size = inline_prefix_size;
- skb_co->type = NC2_PACKET_TYPE_receiver_copy;
+ if (use_subpage_grants)
+ skb_co->type = NC2_PACKET_TYPE_receiver_copy;
+ else
+ skb_co->type = NC2_PACKET_TYPE_receiver_map;
return PREP_XMIT_OKAY;
}
@@ -318,15 +334,19 @@ static void prepare_subpage_grant(struct netchannel2_ring_pair *ncrp,
GTF_readonly,
trans_domid,
trans_gref);
- } else {
+ } else if (plan->use_subpage_grants) {
gnttab_grant_foreign_access_ref_subpage(gref,
ncrp->otherend_id,
virt_to_mfn(page_address(page)),
GTF_readonly,
off_in_page,
size);
+ } else {
+ gnttab_grant_foreign_access_ref(gref,
+ ncrp->otherend_id,
+ virt_to_mfn(page_address(page)),
+ GTF_readonly);
}
-
frag->off = off_in_page;
frag->size = size;
plan->out_fragment++;
@@ -356,6 +376,7 @@ static int grant_data_area(struct netchannel2_ring_pair *ncrp,
void xmit_grant(struct netchannel2_ring_pair *ncrp,
struct sk_buff *skb,
+ int use_subpage_grants,
volatile void *msg_buf)
{
volatile struct netchannel2_msg_packet *msg = msg_buf;
@@ -366,6 +387,7 @@ void xmit_grant(struct netchannel2_ring_pair *ncrp,
skb_frag_t *frag;
memset(&plan, 0, sizeof(plan));
+ plan.use_subpage_grants = use_subpage_grants;
plan.prefix_avail = skb_co->inline_prefix_size;
plan.out_fragment = msg->frags;
plan.gref_pool = skb_co->gref_pool;
diff --git a/drivers/xen/netchannel2/util.c b/drivers/xen/netchannel2/util.c
index 302dfc1..79d9f09 100644
--- a/drivers/xen/netchannel2/util.c
+++ b/drivers/xen/netchannel2/util.c
@@ -94,6 +94,20 @@ void release_tx_packet(struct netchannel2_ring_pair *ncrp,
}
gnttab_release_grant_reference(&ncrp->gref_pool, gref);
}
+ } else if (skb_co->type == NC2_PACKET_TYPE_receiver_map) {
+ while (1) {
+ r = gnttab_claim_grant_reference(&skb_co->gref_pool);
+ if (r == -ENOSPC)
+ break;
+ gref = (grant_ref_t)r;
+ r = gnttab_end_foreign_access_ref(gref);
+ if (r == 0) {
+ printk(KERN_WARNING "Failed to end remote access to packet memory.\n");
+ } else {
+ gnttab_release_grant_reference(&ncrp->gref_pool,
+ gref);
+ }
+ }
} else if (skb_co->gref_pool != 0) {
gnttab_subfree_grant_references(skb_co->gref_pool,
&ncrp->gref_pool);
diff --git a/drivers/xen/netchannel2/xmit_packet.c b/drivers/xen/netchannel2/xmit_packet.c
index 7eb845d..d95ad09 100644
--- a/drivers/xen/netchannel2/xmit_packet.c
+++ b/drivers/xen/netchannel2/xmit_packet.c
@@ -13,6 +13,8 @@ static enum transmit_policy transmit_policy(struct netchannel2 *nc,
{
if (skb->len <= PACKET_PREFIX_SIZE && !skb_is_nonlinear(skb))
return transmit_policy_small;
+ else if (nc->remote_trusted)
+ return transmit_policy_map;
else
return transmit_policy_grant;
}
@@ -72,7 +74,10 @@ enum prepare_xmit_result prepare_xmit_allocate_resources(struct netchannel2 *nc,
r = prepare_xmit_allocate_small(&nc->rings, skb);
break;
case transmit_policy_grant:
- r = prepare_xmit_allocate_grant(&nc->rings, skb);
+ r = prepare_xmit_allocate_grant(&nc->rings, skb, 1);
+ break;
+ case transmit_policy_map:
+ r = prepare_xmit_allocate_grant(&nc->rings, skb, 0);
break;
default:
BUG();
@@ -170,7 +175,10 @@ int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp,
/* Nothing to do */
break;
case transmit_policy_grant:
- xmit_grant(ncrp, skb, msg);
+ xmit_grant(ncrp, skb, 1, msg);
+ break;
+ case transmit_policy_map:
+ xmit_grant(ncrp, skb, 0, msg);
break;
default:
BUG();
diff --git a/include/xen/interface/io/netchannel2.h b/include/xen/interface/io/netchannel2.h
index 1cca607..f264995 100644
--- a/include/xen/interface/io/netchannel2.h
+++ b/include/xen/interface/io/netchannel2.h
@@ -46,6 +46,9 @@ struct netchannel2_fragment {
struct {
grant_ref_t gref;
} receiver_copy;
+ struct {
+ grant_ref_t gref;
+ } receiver_map;
};
};
struct netchannel2_msg_packet {
@@ -98,6 +101,22 @@ struct netchannel2_msg_packet {
* Due to backend bugs, it is in not safe to use this
* packet type except on bypass rings.
*
+ * receiver_map -- The transmitting domain has granted the receiving
+ * domain access to the original RX buffers using
+ * full (mappable) grant references. This can be
+ * treated the same way as receiver_copy, but the
+ * receiving domain also has the option of mapping
+ * the fragments, rather than copying them. If it
+ * decides to do so, it should ensure that the fragments
+ * will be unmapped in a reasonably timely fashion,
+ * and don't e.g. become stuck in a receive buffer
+ * somewhere. In general, anything longer than about
+ * a second is likely to cause problems. Once all
+ * grant references have been unmapped, the receiving
+ * domain should send a FINISH message.
+ *
+ * This packet type may not be used on bypass rings.
+ *
* small -- The packet does not have any fragment descriptors
* (i.e. the entire thing is inline in the ring). The receiving
* domain should simply the copy the packet out of the ring
@@ -110,6 +129,7 @@ struct netchannel2_msg_packet {
* that it is correct to treat receiver_map and small packets as
* receiver_copy ones. */
#define NC2_PACKET_TYPE_receiver_copy 1
+#define NC2_PACKET_TYPE_receiver_map 3
#define NC2_PACKET_TYPE_small 4
#define NC2_PACKET_SEGMENTATION_TYPE_none 0
--
1.6.3.1