WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
xen-ppc-devel

[XenPPC] [linux-ppc-2.6] [LINUX][XEN] backport net drivers

To: xen-ppc-devel@xxxxxxxxxxxxxxxxxxx
Subject: [XenPPC] [linux-ppc-2.6] [LINUX][XEN] backport net drivers
From: Xen patchbot-linux-ppc-2.6 <patchbot-linux-ppc-2.6@xxxxxxxxxxxxxxxxxxx>
Date: Sun, 08 Oct 2006 18:03:33 +0000
Delivery-date: Sun, 08 Oct 2006 11:32:10 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-ppc-devel-request@lists.xensource.com?subject=help>
List-id: Xen PPC development <xen-ppc-devel.lists.xensource.com>
List-post: <mailto:xen-ppc-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-ppc-devel>, <mailto:xen-ppc-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-ppc-devel>, <mailto:xen-ppc-devel-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-ppc-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-ppc-devel-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Jimi Xenidis <jimix@xxxxxxxxxxxxxx>
# Node ID 11ee20d418ea813709da4c86dbc4ae28efb17f36
# Parent  fe1a31c06cbe548d3eb59b4e0fc16fc102a16344
[LINUX][XEN] backport net drivers

This is simply a copy of the network files from the Sparse tree that
have not made it to our tree yet.

Signed-off-by: Jimi Xenidis <jimix@xxxxxxxxxxxxxx>
---
 drivers/xen/netback/common.h    |   25 
 drivers/xen/netback/interface.c |   85 +-
 drivers/xen/netback/loopback.c  |    8 
 drivers/xen/netback/netback.c   |  834 ++++++++++++++++++++------
 drivers/xen/netback/xenbus.c    |   87 ++
 drivers/xen/netfront/netfront.c | 1242 ++++++++++++++++++++++++++++------------
 6 files changed, 1696 insertions(+), 585 deletions(-)

diff -r fe1a31c06cbe -r 11ee20d418ea drivers/xen/netback/common.h
--- a/drivers/xen/netback/common.h      Sun Oct 08 12:23:50 2006 -0400
+++ b/drivers/xen/netback/common.h      Sun Oct 08 12:28:37 2006 -0400
@@ -64,9 +64,9 @@ typedef struct netif_st {
 
        /* Physical parameters of the comms window. */
        grant_handle_t   tx_shmem_handle;
-       grant_ref_t      tx_shmem_ref; 
+       grant_ref_t      tx_shmem_ref;
        grant_handle_t   rx_shmem_handle;
-       grant_ref_t      rx_shmem_ref; 
+       grant_ref_t      rx_shmem_ref;
        unsigned int     evtchn;
        unsigned int     irq;
 
@@ -75,6 +75,13 @@ typedef struct netif_st {
        netif_rx_back_ring_t rx;
        struct vm_struct *tx_comms_area;
        struct vm_struct *rx_comms_area;
+
+       /* Set of features that can be turned on in dev->features. */
+       int features;
+
+       /* Internal feature information. */
+       int can_queue:1;        /* can queue packets for receiver? */
+       int copying_receiver:1; /* copy packets to receiver?       */
 
        /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
        RING_IDX rx_req_cons_peek;
@@ -86,8 +93,6 @@ typedef struct netif_st {
        struct timer_list credit_timeout;
 
        /* Miscellaneous private stuff. */
-       enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
-       int active;
        struct list_head list;  /* scheduling list */
        atomic_t         refcnt;
        struct net_device *dev;
@@ -121,4 +126,16 @@ struct net_device_stats *netif_be_get_st
 struct net_device_stats *netif_be_get_stats(struct net_device *dev);
 irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
 
+static inline int netbk_can_queue(struct net_device *dev)
+{
+       netif_t *netif = netdev_priv(dev);
+       return netif->can_queue;
+}
+
+static inline int netbk_can_sg(struct net_device *dev)
+{
+       netif_t *netif = netdev_priv(dev);
+       return netif->features & NETIF_F_SG;
+}
+
 #endif /* __NETIF__BACKEND__COMMON_H__ */
diff -r fe1a31c06cbe -r 11ee20d418ea drivers/xen/netback/interface.c
--- a/drivers/xen/netback/interface.c   Sun Oct 08 12:23:50 2006 -0400
+++ b/drivers/xen/netback/interface.c   Sun Oct 08 12:28:37 2006 -0400
@@ -36,46 +36,75 @@
 
 static void __netif_up(netif_t *netif)
 {
-       struct net_device *dev = netif->dev;
-       netif_tx_lock_bh(dev);
-       netif->active = 1;
-       netif_tx_unlock_bh(dev);
        enable_irq(netif->irq);
        netif_schedule_work(netif);
 }
 
 static void __netif_down(netif_t *netif)
 {
-       struct net_device *dev = netif->dev;
        disable_irq(netif->irq);
-       netif_tx_lock_bh(dev);
-       netif->active = 0;
-       netif_tx_unlock_bh(dev);
        netif_deschedule_work(netif);
 }
 
 static int net_open(struct net_device *dev)
 {
        netif_t *netif = netdev_priv(dev);
-       if (netif->status == CONNECTED)
+       if (netif_carrier_ok(dev))
                __netif_up(netif);
-       netif_start_queue(dev);
        return 0;
 }
 
 static int net_close(struct net_device *dev)
 {
        netif_t *netif = netdev_priv(dev);
-       netif_stop_queue(dev);
-       if (netif->status == CONNECTED)
+       if (netif_carrier_ok(dev))
                __netif_down(netif);
        return 0;
+}
+
+static int netbk_change_mtu(struct net_device *dev, int mtu)
+{
+       int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+       if (mtu > max)
+               return -EINVAL;
+       dev->mtu = mtu;
+       return 0;
+}
+
+static int netbk_set_sg(struct net_device *dev, u32 data)
+{
+       if (data) {
+               netif_t *netif = netdev_priv(dev);
+
+               if (!(netif->features & NETIF_F_SG))
+                       return -ENOSYS;
+       }
+
+       return ethtool_op_set_sg(dev, data);
+}
+
+static int netbk_set_tso(struct net_device *dev, u32 data)
+{
+       if (data) {
+               netif_t *netif = netdev_priv(dev);
+
+               if (!(netif->features & NETIF_F_TSO))
+                       return -ENOSYS;
+       }
+
+       return ethtool_op_set_tso(dev, data);
 }
 
 static struct ethtool_ops network_ethtool_ops =
 {
        .get_tx_csum = ethtool_op_get_tx_csum,
        .set_tx_csum = ethtool_op_set_tx_csum,
+       .get_sg = ethtool_op_get_sg,
+       .set_sg = netbk_set_sg,
+       .get_tso = ethtool_op_get_tso,
+       .set_tso = netbk_set_tso,
+       .get_link = ethtool_op_get_link,
 };
 
 netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN])
@@ -92,11 +121,12 @@ netif_t *netif_alloc(domid_t domid, unsi
                return ERR_PTR(-ENOMEM);
        }
 
+       netif_carrier_off(dev);
+
        netif = netdev_priv(dev);
        memset(netif, 0, sizeof(*netif));
        netif->domid  = domid;
        netif->handle = handle;
-       netif->status = DISCONNECTED;
        atomic_set(&netif->refcnt, 1);
        init_waitqueue_head(&netif->waiting_to_free);
        netif->dev = dev;
@@ -109,12 +139,16 @@ netif_t *netif_alloc(domid_t domid, unsi
        dev->get_stats       = netif_be_get_stats;
        dev->open            = net_open;
        dev->stop            = net_close;
+       dev->change_mtu      = netbk_change_mtu;
        dev->features        = NETIF_F_IP_CSUM;
 
        SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
 
-       /* Disable queuing. */
-       dev->tx_queue_len = 0;
+       /*
+        * Reduce default TX queuelen so that each guest interface only
+        * allows it to eat around 6.4MB of host memory.
+        */
+       dev->tx_queue_len = 100;
 
        for (i = 0; i < ETH_ALEN; i++)
                if (be_mac[i] != 0)
@@ -255,11 +289,9 @@ int netif_map(netif_t *netif, unsigned l
        netif->rx_req_cons_peek = 0;
 
        netif_get(netif);
-       wmb(); /* Other CPUs see new state before interface is started. */
 
        rtnl_lock();
-       netif->status = CONNECTED;
-       wmb();
+       netif_carrier_on(netif->dev);
        if (netif_running(netif->dev))
                __netif_up(netif);
        rtnl_unlock();
@@ -295,20 +327,13 @@ static void netif_free(netif_t *netif)
 
 void netif_disconnect(netif_t *netif)
 {
-       switch (netif->status) {
-       case CONNECTED:
+       if (netif_carrier_ok(netif->dev)) {
                rtnl_lock();
-               netif->status = DISCONNECTING;
-               wmb();
+               netif_carrier_off(netif->dev);
                if (netif_running(netif->dev))
                        __netif_down(netif);
                rtnl_unlock();
                netif_put(netif);
-               /* fall through */
-       case DISCONNECTED:
-               netif_free(netif);
-               break;
-       default:
-               BUG();
-       }
-}
+       }
+       netif_free(netif);
+}
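
A note on the netbk_change_mtu() hunk above: with scatter-gather
negotiated, the MTU is capped at 65535 - ETH_HLEN (the 16-bit frame
size limit minus the Ethernet header), i.e. 65521; without SG it stays
at the standard ETH_DATA_LEN. A minimal standalone sketch of that
bound, not part of the patch (constants redefined locally so it
compiles outside the kernel):

#include <assert.h>

#define ETH_HLEN     14      /* Ethernet header length, as in <linux/if_ether.h> */
#define ETH_DATA_LEN 1500    /* standard Ethernet payload */

/* Mirrors the bound computed in netbk_change_mtu() above. */
static int netbk_max_mtu(int can_sg)
{
        return can_sg ? 65535 - ETH_HLEN : ETH_DATA_LEN;
}

int main(void)
{
        assert(netbk_max_mtu(1) == 65521);  /* SG: 65535 - 14 */
        assert(netbk_max_mtu(0) == 1500);   /* no SG */
        return 0;
}
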
diff -r fe1a31c06cbe -r 11ee20d418ea drivers/xen/netback/loopback.c
--- a/drivers/xen/netback/loopback.c    Sun Oct 08 12:23:50 2006 -0400
+++ b/drivers/xen/netback/loopback.c    Sun Oct 08 12:28:37 2006 -0400
@@ -125,6 +125,11 @@ static struct ethtool_ops network_ethtoo
 {
        .get_tx_csum = ethtool_op_get_tx_csum,
        .set_tx_csum = ethtool_op_set_tx_csum,
+       .get_sg = ethtool_op_get_sg,
+       .set_sg = ethtool_op_set_sg,
+       .get_tso = ethtool_op_get_tso,
+       .set_tso = ethtool_op_set_tso,
+       .get_link = ethtool_op_get_link,
 };
 
 /*
@@ -152,6 +157,7 @@ static void loopback_construct(struct ne
 
        dev->features        = (NETIF_F_HIGHDMA |
                                NETIF_F_LLTX |
+                               NETIF_F_TSO |
                                NETIF_F_SG |
                                NETIF_F_IP_CSUM);
 
@@ -212,7 +218,7 @@ static int __init make_loopback(int i)
        return err;
 }
 
-static void __init clean_loopback(int i)
+static void __exit clean_loopback(int i)
 {
        struct net_device *dev1, *dev2;
        char dev_name[IFNAMSIZ];
diff -r fe1a31c06cbe -r 11ee20d418ea drivers/xen/netback/netback.c
--- a/drivers/xen/netback/netback.c     Sun Oct 08 12:23:50 2006 -0400
+++ b/drivers/xen/netback/netback.c     Sun Oct 08 12:28:37 2006 -0400
@@ -40,17 +40,23 @@
 
 /*#define NETBE_DEBUG_INTERRUPT*/
 
+struct netbk_rx_meta {
+       skb_frag_t frag;
+       int id;
+       int copy:1;
+};
+
 static void netif_idx_release(u16 pending_idx);
 static void netif_page_release(struct page *page);
 static void make_tx_response(netif_t *netif, 
-                            u16      id,
+                            netif_tx_request_t *txp,
                             s8       st);
-static int  make_rx_response(netif_t *netif, 
-                            u16      id, 
-                            s8       st,
-                            u16      offset,
-                            u16      size,
-                            u16      flags);
+static netif_rx_response_t *make_rx_response(netif_t *netif, 
+                                            u16      id, 
+                                            s8       st,
+                                            u16      offset,
+                                            u16      size,
+                                            u16      flags);
 
 static void net_tx_action(unsigned long unused);
 static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
@@ -63,13 +69,11 @@ static struct timer_list net_timer;
 #define MAX_PENDING_REQS 256
 
 static struct sk_buff_head rx_queue;
-static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1];
-static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
-static gnttab_transfer_t grant_rx_op[NET_RX_RING_SIZE];
-static unsigned char rx_notify[NR_IRQS];
 
 static unsigned long mmap_vstart;
 #define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
+
+static void *rx_mmap_area;
 
 #define PKT_PROT_LEN 64
 
@@ -97,27 +101,30 @@ static spinlock_t net_schedule_list_lock
 static spinlock_t net_schedule_list_lock;
 
 #define MAX_MFN_ALLOC 64
-static unsigned long mfn_list[MAX_MFN_ALLOC];
+static xen_pfn_t mfn_list[MAX_MFN_ALLOC];
 static unsigned int alloc_index = 0;
-static DEFINE_SPINLOCK(mfn_lock);
-
-static unsigned long alloc_mfn(void)
-{
-       unsigned long mfn = 0, flags;
+
+static inline unsigned long alloc_mfn(void)
+{
+       return mfn_list[--alloc_index];
+}
+
+static int check_mfn(int nr)
+{
        struct xen_memory_reservation reservation = {
-               .nr_extents   = MAX_MFN_ALLOC,
                .extent_order = 0,
                .domid        = DOMID_SELF
        };
-       set_xen_guest_handle(reservation.extent_start, (xen_pfn_t *)mfn_list);
-       spin_lock_irqsave(&mfn_lock, flags);
-       if ( unlikely(alloc_index == 0) )
-               alloc_index = HYPERVISOR_memory_op(
-                       XENMEM_increase_reservation, &reservation);
-       if ( alloc_index != 0 )
-               mfn = mfn_list[--alloc_index];
-       spin_unlock_irqrestore(&mfn_lock, flags);
-       return mfn;
+
+       if (likely(alloc_index >= nr))
+               return 0;
+
+       set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index);
+       reservation.nr_extents = MAX_MFN_ALLOC - alloc_index;
+       alloc_index += HYPERVISOR_memory_op(XENMEM_increase_reservation,
+                                           &reservation);
+
+       return alloc_index >= nr ? 0 : -ENOMEM;
 }
 
 static inline void maybe_schedule_tx_action(void)
@@ -139,6 +146,123 @@ static inline int is_xen_skb(struct sk_b
        return (cp == skbuff_cachep);
 }
 
+/*
+ * We can flip without copying the packet unless:
+ *  1. The data is not allocated from our special cache; or
+ *  2. The main data area is shared; or
+ *  3. One or more fragments are shared; or
+ *  4. There are chained fragments.
+ */
+static inline int is_flippable_skb(struct sk_buff *skb)
+{
+       int frag;
+
+       if (!is_xen_skb(skb) || skb_cloned(skb))
+               return 0;
+
+       for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
+               if (page_count(skb_shinfo(skb)->frags[frag].page) > 1)
+                       return 0;
+       }
+
+       if (skb_shinfo(skb)->frag_list != NULL)
+               return 0;
+
+       return 1;
+}
+
+static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
+{
+       struct skb_shared_info *ninfo;
+       struct sk_buff *nskb;
+       unsigned long offset;
+       int ret;
+       int len;
+       int headlen;
+
+       BUG_ON(skb_shinfo(skb)->frag_list != NULL);
+
+       nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC);
+       if (unlikely(!nskb))
+               goto err;
+
+       skb_reserve(nskb, 16);
+       headlen = nskb->end - nskb->data;
+       if (headlen > skb_headlen(skb))
+               headlen = skb_headlen(skb);
+       ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
+       BUG_ON(ret);
+
+       ninfo = skb_shinfo(nskb);
+       ninfo->gso_size = skb_shinfo(skb)->gso_size;
+       ninfo->gso_type = skb_shinfo(skb)->gso_type;
+
+       offset = headlen;
+       len = skb->len - headlen;
+
+       nskb->len = skb->len;
+       nskb->data_len = len;
+       nskb->truesize += len;
+
+       while (len) {
+               struct page *page;
+               int copy;
+               int zero;
+
+               if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
+                       dump_stack();
+                       goto err_free;
+               }
+
+               copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
+               zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
+
+               page = alloc_page(GFP_ATOMIC | zero);
+               if (unlikely(!page))
+                       goto err_free;
+
+               ret = skb_copy_bits(skb, offset, page_address(page), copy);
+               BUG_ON(ret);
+
+               ninfo->frags[ninfo->nr_frags].page = page;
+               ninfo->frags[ninfo->nr_frags].page_offset = 0;
+               ninfo->frags[ninfo->nr_frags].size = copy;
+               ninfo->nr_frags++;
+
+               offset += copy;
+               len -= copy;
+       }
+
+       offset = nskb->data - skb->data;
+
+       nskb->h.raw = skb->h.raw + offset;
+       nskb->nh.raw = skb->nh.raw + offset;
+       nskb->mac.raw = skb->mac.raw + offset;
+
+       return nskb;
+
+ err_free:
+       kfree_skb(nskb);
+ err:
+       return NULL;
+}
+
+static inline int netbk_max_required_rx_slots(netif_t *netif)
+{
+       if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
+               return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
+       return 1; /* all in one */
+}
+
+static inline int netbk_queue_full(netif_t *netif)
+{
+       RING_IDX peek   = netif->rx_req_cons_peek;
+       RING_IDX needed = netbk_max_required_rx_slots(netif);
+
+       return ((netif->rx.sring->req_prod - peek) < needed) ||
+              ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
+}
+
 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        netif_t *netif = netdev_priv(dev);
@@ -146,30 +270,24 @@ int netif_be_start_xmit(struct sk_buff *
        BUG_ON(skb->dev != dev);
 
        /* Drop the packet if the target domain has no receive buffers. */
-       if (!netif->active || 
-           (netif->rx_req_cons_peek == netif->rx.sring->req_prod) ||
-           ((netif->rx_req_cons_peek - netif->rx.rsp_prod_pvt) ==
-            NET_RX_RING_SIZE))
+       if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev)))
                goto drop;
 
-       /*
-        * We do not copy the packet unless:
-        *  1. The data is shared; or
-        *  2. The data is not allocated from our special cache.
-        * NB. We also couldn't cope with fragmented packets, but we won't get
-        *     any because we not advertise the NETIF_F_SG feature.
-        */
-       if (skb_shared(skb) || skb_cloned(skb) || !is_xen_skb(skb)) {
-               int hlen = skb->data - skb->head;
-               int ret;
-               struct sk_buff *nskb = dev_alloc_skb(hlen + skb->len);
+       if (unlikely(netbk_queue_full(netif))) {
+               /* Not a BUG_ON() -- misbehaving netfront can trigger this. */
+               if (netbk_can_queue(dev))
+                       DPRINTK("Queue full but not stopped!\n");
+               goto drop;
+       }
+
+       /* Copy the packet here if it's destined for a flipping
+          interface but isn't flippable (e.g. extra references to
+          data)
+       */
+       if (!netif->copying_receiver && !is_flippable_skb(skb)) {
+               struct sk_buff *nskb = netbk_copy_skb(skb);
                if ( unlikely(nskb == NULL) )
                        goto drop;
-               skb_reserve(nskb, hlen);
-               __skb_put(nskb, skb->len);
-               ret = skb_copy_bits(skb, -hlen, nskb->data - hlen,
-                                    skb->len + hlen);
-               BUG_ON(ret);
                /* Copy only the header fields we use in this driver. */
                nskb->dev = skb->dev;
                nskb->ip_summed = skb->ip_summed;
@@ -178,8 +296,17 @@ int netif_be_start_xmit(struct sk_buff *
                skb = nskb;
        }
 
-       netif->rx_req_cons_peek++;
+       netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
+                                  !!skb_shinfo(skb)->gso_size;
        netif_get(netif);
+
+       if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
+               netif->rx.sring->req_event = netif->rx_req_cons_peek +
+                       netbk_max_required_rx_slots(netif);
+               mb(); /* request notification /then/ check & stop the queue */
+               if (netbk_queue_full(netif))
+                       netif_stop_queue(dev);
+       }
 
        skb_queue_tail(&rx_queue, skb);
        tasklet_schedule(&net_rx_tasklet);
@@ -203,7 +330,7 @@ static void xen_network_done_notify(void
 /* 
  * Add following to poll() function in NAPI driver (Tigon3 is example):
  *  if ( xen_network_done() )
- *      tg3_enable_ints(tp); 
+ *      tg3_enable_ints(tp);
  */
 int xen_network_done(void)
 {
@@ -211,148 +338,371 @@ int xen_network_done(void)
 }
 #endif
 
-static void net_rx_action(unsigned long unused)
-{
-       netif_t *netif = NULL; 
-       s8 status;
-       u16 size, id, irq, flags;
+struct netrx_pending_operations {
+       unsigned trans_prod, trans_cons;
+       unsigned mmu_prod, mmu_cons;
+       unsigned mcl_prod, mcl_cons;
+       unsigned copy_prod, copy_cons;
+       unsigned meta_prod, meta_cons;
+       mmu_update_t *mmu;
+       gnttab_transfer_t *trans;
+       gnttab_copy_t *copy;
        multicall_entry_t *mcl;
+       struct netbk_rx_meta *meta;
+};
+
+/* Set up the grant operations for this fragment.  If it's a flipping
+   interface, we also set up the unmap request from here. */
+static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
+                         int i, struct netrx_pending_operations *npo,
+                         struct page *page, unsigned long size,
+                         unsigned long offset)
+{
        mmu_update_t *mmu;
        gnttab_transfer_t *gop;
-       unsigned long vdata, old_mfn, new_mfn;
+       gnttab_copy_t *copy_gop;
+       multicall_entry_t *mcl;
+       netif_rx_request_t *req;
+       unsigned long old_mfn, new_mfn;
+
+       old_mfn = virt_to_mfn(page_address(page));
+
+       req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
+       if (netif->copying_receiver) {
+               /* The fragment needs to be copied rather than
+                  flipped. */
+               meta->copy = 1;
+               copy_gop = npo->copy + npo->copy_prod++;
+               copy_gop->source.domid = DOMID_SELF;
+               copy_gop->source.offset = offset;
+               copy_gop->source.u.gmfn = old_mfn;
+               copy_gop->dest.domid = netif->domid;
+               copy_gop->dest.offset = 0;
+               copy_gop->dest.u.ref = req->gref;
+               copy_gop->len = size;
+               copy_gop->flags = GNTCOPY_dest_gref;
+       } else {
+               meta->copy = 0;
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       new_mfn = alloc_mfn();
+
+                       /*
+                        * Set the new P2M table entry before
+                        * reassigning the old data page. Heed the
+                        * comment in pgtable-2level.h:pte_page(). :-)
+                        */
+                       set_phys_to_machine(page_to_pfn(page), new_mfn);
+
+                       mcl = npo->mcl + npo->mcl_prod++;
+                       MULTI_update_va_mapping(mcl,
+                                            (unsigned long)page_address(page),
+                                            pfn_pte_ma(new_mfn, PAGE_KERNEL),
+                                            0);
+
+                       mmu = npo->mmu + npo->mmu_prod++;
+                       mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
+                               MMU_MACHPHYS_UPDATE;
+                       mmu->val = page_to_pfn(page);
+               }
+
+               gop = npo->trans + npo->trans_prod++;
+               gop->mfn = old_mfn;
+               gop->domid = netif->domid;
+               gop->ref = req->gref;
+       }
+       return req->id;
+}
+
+static void netbk_gop_skb(struct sk_buff *skb,
+                         struct netrx_pending_operations *npo)
+{
+       netif_t *netif = netdev_priv(skb->dev);
+       int nr_frags = skb_shinfo(skb)->nr_frags;
+       int i;
+       int extra;
+       struct netbk_rx_meta *head_meta, *meta;
+
+       head_meta = npo->meta + npo->meta_prod++;
+       head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
+       head_meta->frag.size = skb_shinfo(skb)->gso_size;
+       extra = !!head_meta->frag.size + 1;
+
+       for (i = 0; i < nr_frags; i++) {
+               meta = npo->meta + npo->meta_prod++;
+               meta->frag = skb_shinfo(skb)->frags[i];
+               meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
+                                         meta->frag.page,
+                                         meta->frag.size,
+                                         meta->frag.page_offset);
+       }
+
+       /*
+        * This must occur at the end to ensure that we don't trash
+        * skb_shinfo until we're done.
+        */
+       head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
+                                      virt_to_page(skb->data),
+                                      skb_headlen(skb),
+                                      offset_in_page(skb->data));
+
+       netif->rx.req_cons += nr_frags + extra;
+}
+
+static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
+{
+       int i;
+
+       for (i = 0; i < nr_frags; i++)
+               put_page(meta[i].frag.page);
+}
+
+/* This is a twin to netbk_gop_skb.  Assume that netbk_gop_skb was
+   used to set up the operations on the top of
+   netrx_pending_operations, which have since been done.  Check that
+   they didn't give any errors and advance over them. */
+static int netbk_check_gop(int nr_frags, domid_t domid,
+                          struct netrx_pending_operations *npo)
+{
+       multicall_entry_t *mcl;
+       gnttab_transfer_t *gop;
+       gnttab_copy_t     *copy_op;
+       int status = NETIF_RSP_OKAY;
+       int i;
+
+       for (i = 0; i <= nr_frags; i++) {
+               if (npo->meta[npo->meta_cons + i].copy) {
+                       copy_op = npo->copy + npo->copy_cons++;
+                       if (copy_op->status != GNTST_okay) {
+                               DPRINTK("Bad status %d from copy to DOM%d.\n",
+                                       copy_op->status, domid);
+                               status = NETIF_RSP_ERROR;
+                       }
+               } else {
+                       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                               mcl = npo->mcl + npo->mcl_cons++;
+                               /* The update_va_mapping() must not fail. */
+                               BUG_ON(mcl->result != 0);
+                       }
+
+                       gop = npo->trans + npo->trans_cons++;
+                       /* Check the reassignment error code. */
+                       if (gop->status != 0) {
+                               DPRINTK("Bad status %d from grant transfer to DOM%u\n",
+                                       gop->status, domid);
+                               /*
+                                * Page no longer belongs to us unless
+                                * GNTST_bad_page, but that should be
+                                * a fatal error anyway.
+                                */
+                               BUG_ON(gop->status == GNTST_bad_page);
+                               status = NETIF_RSP_ERROR;
+                       }
+               }
+       }
+
+       return status;
+}
+
+static void netbk_add_frag_responses(netif_t *netif, int status,
+                                    struct netbk_rx_meta *meta, int nr_frags)
+{
+       int i;
+       unsigned long offset;
+
+       for (i = 0; i < nr_frags; i++) {
+               int id = meta[i].id;
+               int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data;
+
+               if (meta[i].copy)
+                       offset = 0;
+               else
+                       offset = meta[i].frag.page_offset;
+               make_rx_response(netif, id, status, offset,
+                                meta[i].frag.size, flags);
+       }
+}
+
+static void net_rx_action(unsigned long unused)
+{
+       netif_t *netif = NULL;
+       s8 status;
+       u16 id, irq, flags;
+       netif_rx_response_t *resp;
+       multicall_entry_t *mcl;
        struct sk_buff_head rxq;
        struct sk_buff *skb;
-       u16 notify_list[NET_RX_RING_SIZE];
        int notify_nr = 0;
        int ret;
+       int nr_frags;
+       int count;
+       unsigned long offset;
+
+       /*
+        * Putting hundreds of bytes on the stack is considered rude.
+        * Static works because a tasklet can only be on one CPU at any time.
+        */
+       static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3];
+       static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
+       static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE];
+       static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE];
+       static unsigned char rx_notify[NR_IRQS];
+       static u16 notify_list[NET_RX_RING_SIZE];
+       static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
+
+       struct netrx_pending_operations npo = {
+               mmu: rx_mmu,
+               trans: grant_trans_op,
+               copy: grant_copy_op,
+               mcl: rx_mcl,
+               meta: meta};
 
        skb_queue_head_init(&rxq);
 
-       mcl = rx_mcl;
-       mmu = rx_mmu;
-       gop = grant_rx_op;
+       count = 0;
 
        while ((skb = skb_dequeue(&rx_queue)) != NULL) {
-               netif   = netdev_priv(skb->dev);
-               vdata   = (unsigned long)skb->data;
-               old_mfn = virt_to_mfn(vdata);
-
-               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               nr_frags = skb_shinfo(skb)->nr_frags;
+               *(int *)skb->cb = nr_frags;
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap) &&
+                   check_mfn(nr_frags + 1)) {
                        /* Memory squeeze? Back off for an arbitrary while. */
-                       if ((new_mfn = alloc_mfn()) == 0) {
-                               if ( net_ratelimit() )
-                                       WPRINTK("Memory squeeze in netback "
-                                               "driver.\n");
-                               mod_timer(&net_timer, jiffies + HZ);
-                               skb_queue_head(&rx_queue, skb);
-                               break;
-                       }
-                       /*
-                        * Set the new P2M table entry before reassigning
-                        * the old data page. Heed the comment in
-                        * pgtable-2level.h:pte_page(). :-)
-                        */
-                       set_phys_to_machine(
-                               __pa(skb->data) >> PAGE_SHIFT,
-                               new_mfn);
-
-                       MULTI_update_va_mapping(mcl, vdata,
-                                               pfn_pte_ma(new_mfn,
-                                                          PAGE_KERNEL), 0);
-                       mcl++;
-
-                       mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
-                               MMU_MACHPHYS_UPDATE;
-                       mmu->val = __pa(vdata) >> PAGE_SHIFT;
-                       mmu++;
-               }
-
-               gop->mfn = old_mfn;
-               gop->domid = netif->domid;
-               gop->ref = RING_GET_REQUEST(
-                       &netif->rx, netif->rx.req_cons)->gref;
-               netif->rx.req_cons++;
-               gop++;
+                       if ( net_ratelimit() )
+                               WPRINTK("Memory squeeze in netback "
+                                       "driver.\n");
+                       mod_timer(&net_timer, jiffies + HZ);
+                       skb_queue_head(&rx_queue, skb);
+                       break;
+               }
+
+               netbk_gop_skb(skb, &npo);
+
+               count += nr_frags + 1;
 
                __skb_queue_tail(&rxq, skb);
 
                /* Filled the batch queue? */
-               if ((gop - grant_rx_op) == ARRAY_SIZE(grant_rx_op))
+               if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
                        break;
        }
 
-       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-               if (mcl == rx_mcl)
-                       return;
-
+       if (npo.mcl_prod &&
+           !xen_feature(XENFEAT_auto_translated_physmap)) {
+               mcl = npo.mcl + npo.mcl_prod++;
+
+               BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
                mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
 
-               if (mmu - rx_mmu) {
-                       mcl->op = __HYPERVISOR_mmu_update;
-                       mcl->args[0] = (unsigned long)rx_mmu;
-                       mcl->args[1] = mmu - rx_mmu;
-                       mcl->args[2] = 0;
-                       mcl->args[3] = DOMID_SELF;
-                       mcl++;
-               }
-
-               ret = HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
-               BUG_ON(ret != 0);
-       }
-
-       ret = HYPERVISOR_grant_table_op(GNTTABOP_transfer, grant_rx_op, 
-                                       gop - grant_rx_op);
+               mcl->op = __HYPERVISOR_mmu_update;
+               mcl->args[0] = (unsigned long)rx_mmu;
+               mcl->args[1] = npo.mmu_prod;
+               mcl->args[2] = 0;
+               mcl->args[3] = DOMID_SELF;
+       }
+
+       if (npo.trans_prod) {
+               mcl = npo.mcl + npo.mcl_prod++;
+               mcl->op = __HYPERVISOR_grant_table_op;
+               mcl->args[0] = GNTTABOP_transfer;
+               mcl->args[1] = (unsigned long)grant_trans_op;
+               mcl->args[2] = npo.trans_prod;
+       }
+
+       if (npo.copy_prod) {
+               mcl = npo.mcl + npo.mcl_prod++;
+               mcl->op = __HYPERVISOR_grant_table_op;
+               mcl->args[0] = GNTTABOP_copy;
+               mcl->args[1] = (unsigned long)grant_copy_op;
+               mcl->args[2] = npo.copy_prod;
+       }
+
+       /* Nothing to do? */
+       if (!npo.mcl_prod)
+               return;
+
+       BUG_ON(npo.copy_prod > NET_RX_RING_SIZE);
+       BUG_ON(npo.mmu_prod > NET_RX_RING_SIZE);
+       BUG_ON(npo.trans_prod > NET_RX_RING_SIZE);
+       BUG_ON(npo.mcl_prod > NET_RX_RING_SIZE+3);
+       BUG_ON(npo.meta_prod > NET_RX_RING_SIZE);
+
+       ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
        BUG_ON(ret != 0);
 
-       mcl = rx_mcl;
-       gop = grant_rx_op;
        while ((skb = __skb_dequeue(&rxq)) != NULL) {
-               netif   = netdev_priv(skb->dev);
-               size    = skb->tail - skb->data;
-
-               atomic_set(&(skb_shinfo(skb)->dataref), 1);
-               skb_shinfo(skb)->nr_frags = 0;
-               skb_shinfo(skb)->frag_list = NULL;
-
-               netif->stats.tx_bytes += size;
+               nr_frags = *(int *)skb->cb;
+
+               netif = netdev_priv(skb->dev);
+               /* We can't rely on skb_release_data to release the
+                  pages used by fragments for us, since it tries to
+                  touch the pages in the fraglist.  If we're in
+                  flipping mode, that doesn't work.  In copying mode,
+                  we still have access to all of the pages, and so
+                  it's safe to let release_data deal with it. */
+               /* (Freeing the fragments is safe since we copy
+                  non-linear skbs destined for flipping interfaces) */
+               if (!netif->copying_receiver) {
+                       atomic_set(&(skb_shinfo(skb)->dataref), 1);
+                       skb_shinfo(skb)->frag_list = NULL;
+                       skb_shinfo(skb)->nr_frags = 0;
+                       netbk_free_pages(nr_frags, meta + npo.meta_cons + 1);
+               }
+
+               netif->stats.tx_bytes += skb->len;
                netif->stats.tx_packets++;
 
-               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-                       /* The update_va_mapping() must not fail. */
-                       BUG_ON(mcl->result != 0);
-                       mcl++;
-               }
-
-               /* Check the reassignment error code. */
-               status = NETIF_RSP_OKAY;
-               if (gop->status != 0) { 
-                       DPRINTK("Bad status %d from grant transfer to DOM%u\n",
-                               gop->status, netif->domid);
-                       /*
-                        * Page no longer belongs to us unless GNTST_bad_page,
-                        * but that should be a fatal error anyway.
-                        */
-                       BUG_ON(gop->status == GNTST_bad_page);
-                       status = NETIF_RSP_ERROR; 
-               }
-               irq = netif->irq;
-               id = RING_GET_REQUEST(&netif->rx, netif->rx.rsp_prod_pvt)->id;
-               flags = 0;
+               status = netbk_check_gop(nr_frags, netif->domid, &npo);
+
+               id = meta[npo.meta_cons].id;
+               flags = nr_frags ? NETRXF_more_data : 0;
+
                if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
                        flags |= NETRXF_csum_blank | NETRXF_data_validated;
                else if (skb->proto_data_valid) /* remote but checksummed? */
                        flags |= NETRXF_data_validated;
-               if (make_rx_response(netif, id, status,
-                                    (unsigned long)skb->data & ~PAGE_MASK,
-                                    size, flags) &&
-                   (rx_notify[irq] == 0)) {
+
+               if (meta[npo.meta_cons].copy)
+                       offset = 0;
+               else
+                       offset = offset_in_page(skb->data);
+               resp = make_rx_response(netif, id, status, offset,
+                                       skb_headlen(skb), flags);
+
+               if (meta[npo.meta_cons].frag.size) {
+                       struct netif_extra_info *gso =
+                               (struct netif_extra_info *)
+                               RING_GET_RESPONSE(&netif->rx,
+                                                 netif->rx.rsp_prod_pvt++);
+
+                       resp->flags |= NETRXF_extra_info;
+
+                       gso->u.gso.size = meta[npo.meta_cons].frag.size;
+                       gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+                       gso->u.gso.pad = 0;
+                       gso->u.gso.features = 0;
+
+                       gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+                       gso->flags = 0;
+               }
+
+               netbk_add_frag_responses(netif, status,
+                                        meta + npo.meta_cons + 1,
+                                        nr_frags);
+
+               RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
+               irq = netif->irq;
+               if (ret && !rx_notify[irq]) {
                        rx_notify[irq] = 1;
                        notify_list[notify_nr++] = irq;
                }
 
+               if (netif_queue_stopped(netif->dev) &&
+                   !netbk_queue_full(netif))
+                       netif_wake_queue(netif->dev);
+
                netif_put(netif);
                dev_kfree_skb(skb);
-               gop++;
+               npo.meta_cons += nr_frags + 1;
        }
 
        while (notify_nr != 0) {
@@ -403,7 +753,9 @@ static void add_to_net_schedule_list_tai
                return;
 
        spin_lock_irq(&net_schedule_list_lock);
-       if (!__on_net_schedule_list(netif) && netif->active) {
+       if (!__on_net_schedule_list(netif) &&
+           likely(netif_running(netif->dev) &&
+                  netif_carrier_ok(netif->dev))) {
                list_add_tail(&netif->list, &net_schedule_list);
                netif_get(netif);
        }
@@ -481,7 +833,7 @@ inline static void net_tx_action_dealloc
 
                netif = pending_tx_info[pending_idx].netif;
 
-               make_tx_response(netif, pending_tx_info[pending_idx].req.id, 
+               make_tx_response(netif, &pending_tx_info[pending_idx].req, 
                                 NETIF_RSP_OKAY);
 
                pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
@@ -490,14 +842,16 @@ inline static void net_tx_action_dealloc
        }
 }
 
-static void netbk_tx_err(netif_t *netif, RING_IDX end)
+static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
 {
        RING_IDX cons = netif->tx.req_cons;
 
        do {
-               netif_tx_request_t *txp = RING_GET_REQUEST(&netif->tx, cons);
-               make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
-       } while (++cons < end);
+               make_tx_response(netif, txp, NETIF_RSP_ERROR);
+               if (cons >= end)
+                       break;
+               txp = RING_GET_REQUEST(&netif->tx, cons++);
+       } while (1);
        netif->tx.req_cons = cons;
        netif_schedule_work(netif);
        netif_put(netif);
@@ -508,7 +862,7 @@ static int netbk_count_requests(netif_t 
 {
        netif_tx_request_t *first = txp;
        RING_IDX cons = netif->tx.req_cons;
-       int frags = 1;
+       int frags = 0;
 
        while (txp->flags & NETTXF_more_data) {
                if (frags >= work_to_do) {
@@ -543,7 +897,7 @@ static gnttab_map_grant_ref_t *netbk_get
        skb_frag_t *frags = shinfo->frags;
        netif_tx_request_t *txp;
        unsigned long pending_idx = *((u16 *)skb->data);
-       RING_IDX cons = netif->tx.req_cons + 1;
+       RING_IDX cons = netif->tx.req_cons;
        int i, start;
 
        /* Skip first skb fragment if it is on same page as header fragment. */
@@ -581,7 +935,7 @@ static int netbk_tx_check_mop(struct sk_
        err = mop->status;
        if (unlikely(err)) {
                txp = &pending_tx_info[pending_idx].req;
-               make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
+               make_tx_response(netif, txp, NETIF_RSP_ERROR);
                pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
                netif_put(netif);
        } else {
@@ -614,7 +968,7 @@ static int netbk_tx_check_mop(struct sk_
 
                /* Error on this fragment: respond to client with an error. */
                txp = &pending_tx_info[pending_idx].req;
-               make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
+               make_tx_response(netif, txp, NETIF_RSP_ERROR);
                pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
                netif_put(netif);
 
@@ -661,6 +1015,57 @@ static void netbk_fill_frags(struct sk_b
        }
 }
 
+int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
+                    int work_to_do)
+{
+       struct netif_extra_info *extra;
+       RING_IDX cons = netif->tx.req_cons;
+
+       do {
+               if (unlikely(work_to_do-- <= 0)) {
+                       DPRINTK("Missing extra info\n");
+                       return -EBADR;
+               }
+
+               extra = (struct netif_extra_info *)
+                       RING_GET_REQUEST(&netif->tx, cons);
+               if (unlikely(!extra->type ||
+                            extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+                       netif->tx.req_cons = ++cons;
+                       DPRINTK("Invalid extra type: %d\n", extra->type);
+                       return -EINVAL;
+               }
+
+               memcpy(&extras[extra->type - 1], extra, sizeof(*extra));
+               netif->tx.req_cons = ++cons;
+       } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
+       return work_to_do;
+}
+
+static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
+{
+       if (!gso->u.gso.size) {
+               DPRINTK("GSO size must not be zero.\n");
+               return -EINVAL;
+       }
+
+       /* Currently only TCPv4 S.O. is supported. */
+       if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+               DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
+               return -EINVAL;
+       }
+
+       skb_shinfo(skb)->gso_size = gso->u.gso.size;
+       skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+       /* Header must be checked, and gso_segs computed. */
+       skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+       skb_shinfo(skb)->gso_segs = 0;
+
+       return 0;
+}
+
 /* Called after netfront has transmitted */
 static void net_tx_action(unsigned long unused)
 {
@@ -668,6 +1073,7 @@ static void net_tx_action(unsigned long 
        struct sk_buff *skb;
        netif_t *netif;
        netif_tx_request_t txreq;
+       struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
        u16 pending_idx;
        RING_IDX i;
        gnttab_map_grant_ref_t *mop;
@@ -726,23 +1132,37 @@ static void net_tx_action(unsigned long 
                }
                netif->remaining_credit -= txreq.size;
 
+               work_to_do--;
+               netif->tx.req_cons = ++i;
+
+               memset(extras, 0, sizeof(extras));
+               if (txreq.flags & NETTXF_extra_info) {
+                       work_to_do = netbk_get_extras(netif, extras,
+                                                     work_to_do);
+                       i = netif->tx.req_cons;
+                       if (unlikely(work_to_do < 0)) {
+                               netbk_tx_err(netif, &txreq, i);
+                               continue;
+                       }
+               }
+
                ret = netbk_count_requests(netif, &txreq, work_to_do);
                if (unlikely(ret < 0)) {
-                       netbk_tx_err(netif, i - ret);
+                       netbk_tx_err(netif, &txreq, i - ret);
                        continue;
                }
                i += ret;
 
-               if (unlikely(ret > MAX_SKB_FRAGS + 1)) {
+               if (unlikely(ret > MAX_SKB_FRAGS)) {
                        DPRINTK("Too many frags\n");
-                       netbk_tx_err(netif, i);
+                       netbk_tx_err(netif, &txreq, i);
                        continue;
                }
 
                if (unlikely(txreq.size < ETH_HLEN)) {
                        DPRINTK("Bad packet size: %d\n", txreq.size);
-                       netbk_tx_err(netif, i);
-                       continue; 
+                       netbk_tx_err(netif, &txreq, i);
+                       continue;
                }
 
                /* No crossing a page as the payload mustn't fragment. */
@@ -750,25 +1170,36 @@ static void net_tx_action(unsigned long 
                        DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", 
                                txreq.offset, txreq.size, 
                                (txreq.offset &~PAGE_MASK) + txreq.size);
-                       netbk_tx_err(netif, i);
+                       netbk_tx_err(netif, &txreq, i);
                        continue;
                }
 
                pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
 
                data_len = (txreq.size > PKT_PROT_LEN &&
-                           ret < MAX_SKB_FRAGS + 1) ?
+                           ret < MAX_SKB_FRAGS) ?
                        PKT_PROT_LEN : txreq.size;
 
                skb = alloc_skb(data_len+16, GFP_ATOMIC);
                if (unlikely(skb == NULL)) {
                        DPRINTK("Can't allocate a skb in start_xmit.\n");
-                       netbk_tx_err(netif, i);
+                       netbk_tx_err(netif, &txreq, i);
                        break;
                }
 
                /* Packets passed to netif_rx() must have some headroom. */
                skb_reserve(skb, 16);
+
+               if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+                       struct netif_extra_info *gso;
+                       gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+                       if (netbk_set_skb_gso(skb, gso)) {
+                               kfree_skb(skb);
+                               netbk_tx_err(netif, &txreq, i);
+                               continue;
+                       }
+               }
 
                gnttab_set_map_op(mop, MMAP_VADDR(pending_idx),
                                  GNTMAP_host_map | GNTMAP_readonly,
@@ -782,11 +1213,14 @@ static void net_tx_action(unsigned long 
 
                __skb_put(skb, data_len);
 
-               skb_shinfo(skb)->nr_frags = ret - 1;
+               skb_shinfo(skb)->nr_frags = ret;
                if (data_len < txreq.size) {
                        skb_shinfo(skb)->nr_frags++;
                        skb_shinfo(skb)->frags[0].page =
                                (void *)(unsigned long)pending_idx;
+               } else {
+                       /* Discriminate from any valid pending_idx value. */
+                       skb_shinfo(skb)->frags[0].page = (void *)~0UL;
                }
 
                __skb_queue_tail(&tx_queue, skb);
@@ -884,21 +1318,32 @@ static void netif_page_release(struct pa
        u16 pending_idx = page - virt_to_page(mmap_vstart);
 
        /* Ready for next use. */
-       init_page_count(page);
+       set_page_count(page, 1);
 
        netif_idx_release(pending_idx);
 }
 
+static void netif_rx_page_release(struct page *page)
+{
+       /* Ready for next use. */
+       set_page_count(page, 1);
+}
+
 irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
 {
        netif_t *netif = dev_id;
+
        add_to_net_schedule_list_tail(netif);
        maybe_schedule_tx_action();
+
+       if (netif_queue_stopped(netif->dev) && !netbk_queue_full(netif))
+               netif_wake_queue(netif->dev);
+
        return IRQ_HANDLED;
 }
 
 static void make_tx_response(netif_t *netif, 
-                            u16      id,
+                            netif_tx_request_t *txp,
                             s8       st)
 {
        RING_IDX i = netif->tx.rsp_prod_pvt;
@@ -906,8 +1351,11 @@ static void make_tx_response(netif_t *ne
        int notify;
 
        resp = RING_GET_RESPONSE(&netif->tx, i);
-       resp->id     = id;
+       resp->id     = txp->id;
        resp->status = st;
+
+       if (txp->flags & NETTXF_extra_info)
+               RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
 
        netif->tx.rsp_prod_pvt = ++i;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
@@ -924,16 +1372,15 @@ static void make_tx_response(netif_t *ne
 #endif
 }
 
-static int make_rx_response(netif_t *netif, 
-                           u16      id, 
-                           s8       st,
-                           u16      offset,
-                           u16      size,
-                           u16      flags)
+static netif_rx_response_t *make_rx_response(netif_t *netif, 
+                                            u16      id, 
+                                            s8       st,
+                                            u16      offset,
+                                            u16      size,
+                                            u16      flags)
 {
        RING_IDX i = netif->rx.rsp_prod_pvt;
        netif_rx_response_t *resp;
-       int notify;
 
        resp = RING_GET_RESPONSE(&netif->rx, i);
        resp->offset     = offset;
@@ -944,9 +1391,8 @@ static int make_rx_response(netif_t *net
                resp->status = (s16)st;
 
        netif->rx.rsp_prod_pvt = ++i;
-       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, notify);
-
-       return notify;
+
+       return resp;
 }
 
 #ifdef NETBE_DEBUG_INTERRUPT
@@ -1002,13 +1448,25 @@ static int __init netback_init(void)
        net_timer.function = net_alarm;
     
        page = balloon_alloc_empty_page_range(MAX_PENDING_REQS);
-       BUG_ON(page == NULL);
+       if (page == NULL)
+               return -ENOMEM;
+
        mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
 
        for (i = 0; i < MAX_PENDING_REQS; i++) {
                page = virt_to_page(MMAP_VADDR(i));
-               init_page_count(page);
+               set_page_count(page, 1);
                SetPageForeign(page, netif_page_release);
+       }
+
+       page = balloon_alloc_empty_page_range(NET_RX_RING_SIZE);
+       BUG_ON(page == NULL);
+       rx_mmap_area = pfn_to_kaddr(page_to_pfn(page));
+
+       for (i = 0; i < NET_RX_RING_SIZE; i++) {
+               page = virt_to_page(rx_mmap_area + (i * PAGE_SIZE));
+               set_page_count(page, 1);
+               SetPageForeign(page, netif_rx_page_release);
        }
 
        pending_cons = 0;
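
Worth pulling out of the netif_be_start_xmit() hunk above: the
transmit path now uses the standard lost-wakeup-free stop/wake
protocol against the guest's rx ring. The helper below merely
restates those lines for readability; it is a sketch, not part of the
patch, and the matching wake-ups are the netif_wake_queue() calls
added to net_rx_action() and netif_be_int():

/* Sketch: the flow control added to netif_be_start_xmit() above. */
static void netbk_stop_queue_if_full(netif_t *netif, struct net_device *dev)
{
        if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
                /* Ask the frontend for an event once enough rx requests
                 * for a worst-case packet become available again. */
                netif->rx.sring->req_event = netif->rx_req_cons_peek +
                        netbk_max_required_rx_slots(netif);
                mb(); /* publish req_event /then/ re-check and stop */
                if (netbk_queue_full(netif))
                        netif_stop_queue(dev);
        }
}
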
diff -r fe1a31c06cbe -r 11ee20d418ea drivers/xen/netback/xenbus.c
--- a/drivers/xen/netback/xenbus.c      Sun Oct 08 12:23:50 2006 -0400
+++ b/drivers/xen/netback/xenbus.c      Sun Oct 08 12:28:37 2006 -0400
@@ -101,6 +101,19 @@ static int netback_probe(struct xenbus_d
                        goto abort_transaction;
                }
 
+               err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
+                                   "%d", 1);
+               if (err) {
+                       message = "writing feature-gso-tcpv4";
+                       goto abort_transaction;
+               }
+
+               err = xenbus_printf(xbt, dev->nodename, "feature-rx-copy", "%d", 1);
+               if (err) {
+                       message = "writing feature-copying";
+                       goto abort_transaction;
+               }
+
                err = xenbus_transaction_end(xbt, 0);
        } while (err == -EAGAIN);
 
@@ -215,16 +228,31 @@ static void frontend_changed(struct xenb
 {
        struct backend_info *be = dev->dev.driver_data;
 
-       DPRINTK("");
+       DPRINTK("%s", xenbus_strstate(frontend_state));
 
        be->frontend_state = frontend_state;
 
        switch (frontend_state) {
        case XenbusStateInitialising:
+               if (dev->state == XenbusStateClosed) {
+                       printk("%s: %s: prepare for reconnect\n",
+                              __FUNCTION__, dev->nodename);
+                       if (be->netif) {
+                               netif_disconnect(be->netif);
+                               be->netif = NULL;
+                       }
+                       xenbus_switch_state(dev, XenbusStateInitWait);
+               }
+               break;
+
        case XenbusStateInitialised:
                break;
 
        case XenbusStateConnected:
+               if (!be->netif) {
+                       /* reconnect: setup be->netif */
+                       backend_changed(&be->backend_watch, NULL, 0);
+               }
                maybe_connect(be);
                break;
 
@@ -233,13 +261,18 @@ static void frontend_changed(struct xenb
                break;
 
        case XenbusStateClosed:
+               xenbus_switch_state(dev, XenbusStateClosed);
+#ifdef JX
+               if (xenbus_dev_is_online(dev))
+                       break;
+#endif
+               /* fall through if not online */
+       case XenbusStateUnknown:
                if (be->netif != NULL)
                        kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
                device_unregister(&dev->dev);
                break;
 
-       case XenbusStateUnknown:
-       case XenbusStateInitWait:
        default:
                xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
                                 frontend_state);
@@ -342,8 +375,9 @@ static int connect_rings(struct backend_
 {
        struct xenbus_device *dev = be->dev;
        unsigned long tx_ring_ref, rx_ring_ref;
-       unsigned int evtchn;
+       unsigned int evtchn, rx_copy;
        int err;
+       int val;
 
        DPRINTK("");
 
@@ -356,6 +390,51 @@ static int connect_rings(struct backend_
                                 "reading %s/ring-ref and event-channel",
                                 dev->otherend);
                return err;
+       }
+
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
+                          &rx_copy);
+       if (err == -ENOENT) {
+               err = 0;
+               rx_copy = 0;
+       }
+       if (err < 0) {
+               xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
+                                dev->otherend);
+               return err;
+       }
+       be->netif->copying_receiver = !!rx_copy;
+
+       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-rx-notify", "%d",
+                        &val) < 0)
+               val = 0;
+       if (val)
+               be->netif->can_queue = 1;
+       else
+               /* Must be non-zero for pfifo_fast to work. */
+               be->netif->dev->tx_queue_len = 1;
+
+       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
+               val = 0;
+       if (val) {
+               be->netif->features |= NETIF_F_SG;
+               be->netif->dev->features |= NETIF_F_SG;
+       }
+
+       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
+                        &val) < 0)
+               val = 0;
+       if (val) {
+               be->netif->features |= NETIF_F_TSO;
+               be->netif->dev->features |= NETIF_F_TSO;
+       }
+
+       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
+                        "%d", &val) < 0)
+               val = 0;
+       if (val) {
+               be->netif->features &= ~NETIF_F_IP_CSUM;
+               be->netif->dev->features &= ~NETIF_F_IP_CSUM;
        }
 
        /* Map the shared frame, irq etc. */
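The hunk above is the backend half of the feature negotiation: each optional capability is an xenstore node the frontend publishes under its end of the connection, and connect_rings() reads it with xenbus_scanf(), treating an absent node as "feature off". Only request-rx-copy distinguishes -ENOENT from a genuine read error, because a failure there has to abort the connect. A minimal sketch of the read-with-default idiom, assuming only the xenbus_scanf() call already visible above (the helper name read_feature is illustrative, not part of the patch):

static int read_feature(struct xenbus_device *dev, const char *node, int def)
{
	/* Read an optional integer feature node published by the frontend,
	 * falling back to 'def' when the node was never written. */
	int val;

	if (xenbus_scanf(XBT_NIL, dev->otherend, node, "%d", &val) < 0)
		val = def;
	return val;
}

With that shape, the feature-sg and feature-gso-tcpv4 reads above collapse to read_feature(dev, "feature-sg", 0) and read_feature(dev, "feature-gso-tcpv4", 0).
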
diff -r fe1a31c06cbe -r 11ee20d418ea drivers/xen/netfront/netfront.c
--- a/drivers/xen/netfront/netfront.c   Sun Oct 08 12:23:50 2006 -0400
+++ b/drivers/xen/netfront/netfront.c   Sun Oct 08 12:28:37 2006 -0400
@@ -46,11 +46,11 @@
 #include <linux/ethtool.h>
 #include <linux/in.h>
 #include <linux/if_ether.h>
+#include <linux/io.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 #include <net/arp.h>
 #include <net/route.h>
-#include <asm/io.h>
 #include <asm/uaccess.h>
 #include <xen/evtchn.h>
 #include <xen/xenbus.h>
@@ -58,21 +58,31 @@
 #include <xen/interface/memory.h>
 #include <xen/balloon.h>
 #include <asm/page.h>
+#include <asm/maddr.h>
 #include <asm/uaccess.h>
 #include <xen/interface/grant_table.h>
 #include <xen/gnttab.h>
 
+#define RX_COPY_THRESHOLD 256
+
+/* If we don't have GSO, fake things up so that we never try to use it. */
+#ifndef NETIF_F_GSO
+#define netif_needs_gso(dev, skb)      0
+#define dev_disable_gso_features(dev)  ((void)0)
+#else
+#define HAVE_GSO                       1
+static inline void dev_disable_gso_features(struct net_device *dev)
+{
+       /* Turn off all GSO bits except ROBUST. */
+       dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
+       dev->features |= NETIF_F_GSO_ROBUST;
+}
+#endif
+
 #define GRANT_INVALID_REF      0
 
 #define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
 #define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
-
-static inline void init_skb_shinfo(struct sk_buff *skb)
-{
-       atomic_set(&(skb_shinfo(skb)->dataref), 1);
-       skb_shinfo(skb)->nr_frags = 0;
-       skb_shinfo(skb)->frag_list = NULL;
-}
 
 struct netfront_info {
        struct list_head list;
@@ -88,6 +98,7 @@ struct netfront_info {
 
        unsigned int handle;
        unsigned int evtchn, irq;
+       unsigned int copying_receiver;
 
        /* Receive-ring batched refills. */
 #define RX_MIN_TARGET 8
@@ -99,30 +110,35 @@ struct netfront_info {
        struct timer_list rx_refill_timer;
 
        /*
-        * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
-        * array is an index into a chain of free entries.
+        * {tx,rx}_skbs store outstanding skbuffs. The first entry in tx_skbs
+        * is an index into a chain of free entries.
         */
        struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
-       struct sk_buff *rx_skbs[NET_RX_RING_SIZE+1];
+       struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
 
 #define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
        grant_ref_t gref_tx_head;
        grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
        grant_ref_t gref_rx_head;
-       grant_ref_t grant_rx_ref[NET_TX_RING_SIZE + 1];
+       grant_ref_t grant_rx_ref[NET_TX_RING_SIZE];
 
        struct xenbus_device *xbdev;
        int tx_ring_ref;
        int rx_ring_ref;
        u8 mac[ETH_ALEN];
 
-       unsigned long rx_pfn_array[NET_RX_RING_SIZE];
+       xen_pfn_t rx_pfn_array[NET_RX_RING_SIZE];
        struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
        struct mmu_update rx_mmu[NET_RX_RING_SIZE];
 };
 
+struct netfront_rx_info {
+       struct netif_rx_response rx;
+       struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
+};
+
 /*
- * Access macros for acquiring freeing slots in {tx,rx}_skbs[].
+ * Access macros for acquiring freeing slots in tx_skbs[].
  */
 
 static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
@@ -136,6 +152,29 @@ static inline unsigned short get_id_from
        unsigned int id = (unsigned int)(unsigned long)list[0];
        list[0] = list[id];
        return id;
+}
+
+static inline int xennet_rxidx(RING_IDX idx)
+{
+       return idx & (NET_RX_RING_SIZE - 1);
+}
+
+static inline struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
+                                               RING_IDX ri)
+{
+       int i = xennet_rxidx(ri);
+       struct sk_buff *skb = np->rx_skbs[i];
+       np->rx_skbs[i] = NULL;
+       return skb;
+}
+
+static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
+                                           RING_IDX ri)
+{
+       int i = xennet_rxidx(ri);
+       grant_ref_t ref = np->grant_rx_ref[i];
+       np->grant_rx_ref[i] = GRANT_INVALID_REF;
+       return ref;
 }
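
xennet_rxidx() above relies on the ring size being a power of two: __RING_SIZE() rounds the entry count down to a power of two, so masking with (NET_RX_RING_SIZE - 1) is exactly a modulo, and a free-running RING_IDX can wrap without ever indexing out of bounds. A tiny standalone illustration of that invariant (plain C, independent of the kernel headers; 256 merely stands in for the real ring size):

#include <assert.h>

#define RING_SIZE 256U	/* stand-in for a power-of-two NET_RX_RING_SIZE */

int main(void)
{
	unsigned int idx;

	/* For a power-of-two size, idx & (SIZE - 1) == idx % SIZE,
	 * even once the free-running index wraps around. */
	for (idx = 0; idx != 4 * RING_SIZE; idx++)
		assert((idx & (RING_SIZE - 1)) == idx % RING_SIZE);
	return 0;
}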
 
 #define DPRINTK(fmt, args...)                          \
@@ -148,12 +187,13 @@ static inline unsigned short get_id_from
 
 static int talk_to_backend(struct xenbus_device *, struct netfront_info *);
 static int setup_device(struct xenbus_device *, struct netfront_info *);
-static struct net_device *create_netdev(int, struct xenbus_device *);
+static struct net_device *create_netdev(int, int, struct xenbus_device *);
 
 static void netfront_closing(struct xenbus_device *);
 
 static void end_access(int, void *);
 static void netif_disconnect_backend(struct netfront_info *);
+static int open_netdev(struct netfront_info *);
 static void close_netdev(struct netfront_info *);
 static void netif_free(struct netfront_info *);
 
@@ -190,6 +230,7 @@ static int __devinit netfront_probe(stru
        struct net_device *netdev;
        struct netfront_info *info;
        unsigned int handle;
+       unsigned feature_rx_copy;
 
        err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%u", &handle);
        if (err != 1) {
@@ -197,7 +238,24 @@ static int __devinit netfront_probe(stru
                return err;
        }
 
-       netdev = create_netdev(handle, dev);
+#ifdef CONFIG_PPC_XEN
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "feature-rx-copy", "%u",
+                          &feature_rx_copy);
+       BUG_ON(err != 1);
+       if (err != 1) {
+               xenbus_dev_fatal(dev, err, "reading feature-rx-copy");
+               return err;
+       }
+       BUG_ON(!feature_rx_copy);
+       if (!feature_rx_copy) {
+               xenbus_dev_fatal(dev, 0, "need a copy-capable backend");
+               return -EINVAL;
+       }
+#else
+       feature_rx_copy = 0;
+#endif
+
+       netdev = create_netdev(handle, feature_rx_copy, dev);
        if (IS_ERR(netdev)) {
                err = PTR_ERR(netdev);
                xenbus_dev_fatal(dev, err, "creating netdev");
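
On PowerPC the probe makes the copying receiver mandatory: feature-rx-copy is read from the backend and the frontend refuses to come up unless it is set, presumably because the flip path's machine-frame remapping is unavailable on this port; other architectures still default to flipping here. Note that each BUG_ON() fires before the graceful error path it guards, so those paths are effectively dead. A BUG_ON-free variant of the same probe would look like this (illustrative sketch only, not part of the patch):

	err = xenbus_scanf(XBT_NIL, dev->otherend, "feature-rx-copy", "%u",
			   &feature_rx_copy);
	if (err != 1) {
		xenbus_dev_fatal(dev, err, "reading feature-rx-copy");
		return err;
	}
	if (!feature_rx_copy) {
		xenbus_dev_fatal(dev, 0, "need a copy-capable backend");
		return -EINVAL;
	}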
@@ -208,15 +266,22 @@ static int __devinit netfront_probe(stru
        dev->dev.driver_data = info;
 
        err = talk_to_backend(dev, info);
-       if (err) {
-               xennet_sysfs_delif(info->netdev);
-               unregister_netdev(netdev);
-               free_netdev(netdev);
-               dev->dev.driver_data = NULL;
-               return err;
-       }
+       if (err)
+               goto fail_backend;
+
+       err = open_netdev(info);
+       if (err)
+               goto fail_open;
 
        return 0;
+
+ fail_open:
+       xennet_sysfs_delif(info->netdev);
+       unregister_netdev(netdev);
+ fail_backend:
+       free_netdev(netdev);
+       dev->dev.driver_data = NULL;
+       return err;
 }
 
 
@@ -303,6 +368,33 @@ again:
                goto abort_transaction;
        }
 
+       err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
+                           info->copying_receiver);
+       if (err) {
+               message = "writing request-rx-copy";
+               goto abort_transaction;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
+       if (err) {
+               message = "writing feature-rx-notify";
+               goto abort_transaction;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
+       if (err) {
+               message = "writing feature-sg";
+               goto abort_transaction;
+       }
+
+#ifdef HAVE_GSO
+       err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
+       if (err) {
+               message = "writing feature-gso-tcpv4";
+               goto abort_transaction;
+       }
+#endif
+
        err = xenbus_transaction_end(xbt, 0);
        if (err) {
                if (err == -EAGAIN)
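
The trailing context is the standard xenbus transaction pattern: everything between xenbus_transaction_start() and xenbus_transaction_end() runs against a snapshot of the store, and an end status of -EAGAIN means the store changed underneath the transaction, so talk_to_backend() jumps back to its again: label and replays the writes. Schematically (a condensed view of the control flow already in this function, not compilable on its own):

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		goto out;

	/* xenbus_printf() the ring refs, event channel and feature nodes */

	err = xenbus_transaction_end(xbt, 0);
	if (err == -EAGAIN)
		goto again;	/* store changed under us: replay the writes */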
@@ -374,7 +466,8 @@ static int setup_device(struct xenbus_de
 
        memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
        err = bind_evtchn_to_irqhandler(info->evtchn, netif_int,
-                                       SA_SAMPLE_RANDOM, netdev->name, netdev);
+                                       SA_SAMPLE_RANDOM, netdev->name,
+                                       netdev);
        if (err < 0)
                goto fail;
        info->irq = err;
@@ -395,7 +488,7 @@ static void backend_changed(struct xenbu
        struct netfront_info *np = dev->dev.driver_data;
        struct net_device *netdev = np->netdev;
 
-       DPRINTK("\n");
+       DPRINTK("%s\n", xenbus_strstate(backend_state));
 
        switch (backend_state) {
        case XenbusStateInitialising:
@@ -453,8 +546,14 @@ static int network_open(struct net_devic
 
        memset(&np->stats, 0, sizeof(np->stats));
 
-       network_alloc_rx_buffers(dev);
-       np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
+       spin_lock(&np->rx_lock);
+       if (netif_carrier_ok(dev)) {
+               network_alloc_rx_buffers(dev);
+               np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
+               if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
+                       netif_rx_schedule(dev);
+       }
+       spin_unlock(&np->rx_lock);
 
        netif_start_queue(dev);
 
@@ -463,7 +562,7 @@ static int network_open(struct net_devic
 
 static inline int netfront_tx_slot_available(struct netfront_info *np)
 {
-       return RING_FREE_REQUESTS(&np->tx) >= MAX_SKB_FRAGS + 1;
+       return RING_FREE_REQUESTS(&np->tx) >= MAX_SKB_FRAGS + 2;
 }
 
 static inline void network_maybe_wake_tx(struct net_device *dev)
@@ -483,15 +582,20 @@ static void network_tx_buf_gc(struct net
        struct netfront_info *np = netdev_priv(dev);
        struct sk_buff *skb;
 
-       if (unlikely(!netif_carrier_ok(dev)))
-               return;
+       BUG_ON(!netif_carrier_ok(dev));
 
        do {
                prod = np->tx.sring->rsp_prod;
                rmb(); /* Ensure we see responses up to 'rp'. */
 
                for (cons = np->tx.rsp_cons; cons != prod; cons++) {
-                       id  = RING_GET_RESPONSE(&np->tx, cons)->id;
+                       struct netif_tx_response *txrsp;
+
+                       txrsp = RING_GET_RESPONSE(&np->tx, cons);
+                       if (txrsp->status == NETIF_RSP_NULL)
+                               continue;
+
+                       id  = txrsp->id;
                        skb = np->tx_skbs[id];
                        if (unlikely(gnttab_query_foreign_access(
                                np->grant_tx_ref[id]) != 0)) {
@@ -540,10 +644,15 @@ static void network_alloc_rx_buffers(str
        unsigned short id;
        struct netfront_info *np = netdev_priv(dev);
        struct sk_buff *skb;
-       int i, batch_target;
+       struct page *page;
+       int i, batch_target, notify;
        RING_IDX req_prod = np->rx.req_prod_pvt;
        struct xen_memory_reservation reservation;
        grant_ref_t ref;
+       unsigned long pfn;
+       void *vaddr;
+       int nr_flips;
+       netif_rx_request_t *req;
 
        if (unlikely(!netif_carrier_ok(dev)))
                return;
@@ -557,28 +666,41 @@ static void network_alloc_rx_buffers(str
        batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
        for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
                /*
-                * Subtract dev_alloc_skb headroom (16 bytes) and shared info
-                * tailroom then round down to SKB_DATA_ALIGN boundary.
+                * Allocate an skb and a page. Do not use __dev_alloc_skb as
+                * that will allocate page-sized buffers which is not
+                * necessary here.
+                * 16 bytes added as necessary headroom for netif_receive_skb.
                 */
-               skb = __dev_alloc_skb(
-                       ((PAGE_SIZE - sizeof(struct skb_shared_info)) &
-                        (-SKB_DATA_ALIGN(1))) - 16,
-                       GFP_ATOMIC|__GFP_NOWARN);
-               if (skb == NULL) {
+               skb = alloc_skb(RX_COPY_THRESHOLD + 16,
+                               GFP_ATOMIC | __GFP_NOWARN);
+               if (unlikely(!skb))
+                       goto no_skb;
+
+               page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+               if (!page) {
+                       kfree_skb(skb);
+no_skb:
                        /* Any skbuffs queued for refill? Force them out. */
                        if (i != 0)
                                goto refill;
                        /* Could not allocate any skbuffs. Try again later. */
                        mod_timer(&np->rx_refill_timer,
                                  jiffies + (HZ/10));
-                       return;
+                       break;
                }
+
+               skb_reserve(skb, 16); /* mimic dev_alloc_skb() */
+               skb_shinfo(skb)->frags[0].page = page;
+               skb_shinfo(skb)->nr_frags = 1;
                __skb_queue_tail(&np->rx_batch, skb);
        }
 
        /* Is the batch large enough to be worthwhile? */
-       if (i < (np->rx_target/2))
+       if (i < (np->rx_target/2)) {
+               if (req_prod > np->rx.sring->req_prod)
+                       goto push;
                return;
+       }
 
        /* Adjust our fill target if we risked running out of buffers. */
        if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
@@ -586,70 +708,93 @@ static void network_alloc_rx_buffers(str
                np->rx_target = np->rx_max_target;
 
  refill:
-       for (i = 0; ; i++) {
+       for (nr_flips = i = 0; ; i++) {
                if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
                        break;
 
                skb->dev = dev;
 
-               id = get_id_from_freelist(np->rx_skbs);
-
+               id = xennet_rxidx(req_prod + i);
+
+               BUG_ON(np->rx_skbs[id]);
                np->rx_skbs[id] = skb;
 
-               RING_GET_REQUEST(&np->rx, req_prod + i)->id = id;
                ref = gnttab_claim_grant_reference(&np->gref_rx_head);
                BUG_ON((signed short)ref < 0);
                np->grant_rx_ref[id] = ref;
-               gnttab_grant_foreign_transfer_ref(ref,
-                                                 np->xbdev->otherend_id,
-                                                 __pa(skb->head)>>PAGE_SHIFT);
-               RING_GET_REQUEST(&np->rx, req_prod + i)->gref = ref;
-               np->rx_pfn_array[i] = virt_to_mfn(skb->head);
+
+               pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
+               vaddr = page_address(skb_shinfo(skb)->frags[0].page);
+
+               req = RING_GET_REQUEST(&np->rx, req_prod + i);
+               if (!np->copying_receiver) {
+                       gnttab_grant_foreign_transfer_ref(ref,
+                                                         np->xbdev->otherend_id,
+                                                         pfn);
+                       np->rx_pfn_array[nr_flips] = pfn_to_mfn(pfn);
+                       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                               /* Remove this page before passing
+                                * back to Xen. */
+                               set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+                               MULTI_update_va_mapping(np->rx_mcl+i,
+                                                       (unsigned long)vaddr,
+                                                       __pte(0), 0);
+                       }
+                       nr_flips++;
+               } else {
+                       gnttab_grant_foreign_access_ref(ref,
+                                                       np->xbdev->otherend_id,
+                                                       pfn,
+                                                       0);
+               }
+
+               req->id = id;
+               req->gref = ref;
+       }
+
+       if (nr_flips != 0) {
+               /* Tell the balloon driver what is going on. */
+               balloon_update_driver_allowance(i);
+
+               set_xen_guest_handle(reservation.extent_start,
+                                    np->rx_pfn_array);
+               reservation.nr_extents   = nr_flips;
+               reservation.extent_order = 0;
+               reservation.address_bits = 0;
+               reservation.domid        = DOMID_SELF;
 
                if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-                       /* Remove this page before passing back to Xen. */
-                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
-                                           INVALID_P2M_ENTRY);
-                       MULTI_update_va_mapping(np->rx_mcl+i,
-                                               (unsigned long)skb->head,
-                                               __pte(0), 0);
+                       /* After all PTEs have been zapped, flush the TLB. */
+                       np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
+                               UVMF_TLB_FLUSH|UVMF_ALL;
+
+                       /* Give away a batch of pages. */
+                       np->rx_mcl[i].op = __HYPERVISOR_memory_op;
+                       np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
+                       np->rx_mcl[i].args[1] = (unsigned long)&reservation;
+
+                       /* Zap PTEs and give away pages in one big
+                        * multicall. */
+                       (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
+
+                       /* Check return status of HYPERVISOR_memory_op(). */
+                       if (unlikely(np->rx_mcl[i].result != i))
+                               panic("Unable to reduce memory reservation\n");
+               } else {
+                       if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+                                                &reservation) != i)
+                               panic("Unable to reduce memory reservation\n");
                }
-       }
-
-       /* Tell the ballon driver what is going on. */
-       balloon_update_driver_allowance(i);
-
-       set_xen_guest_handle(reservation.extent_start,
-                            (xen_pfn_t *)np->rx_pfn_array);
-       reservation.nr_extents   = i;
-       reservation.extent_order = 0;
-       reservation.address_bits = 0;
-       reservation.domid        = DOMID_SELF;
-
-       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-               /* After all PTEs have been zapped, flush the TLB. */
-               np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
-                       UVMF_TLB_FLUSH|UVMF_ALL;
-
-               /* Give away a batch of pages. */
-               np->rx_mcl[i].op = __HYPERVISOR_memory_op;
-               np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
-               np->rx_mcl[i].args[1] = (unsigned long)&reservation;
-
-               /* Zap PTEs and give away pages in one big multicall. */
-               (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
-
-               /* Check return status of HYPERVISOR_memory_op(). */
-               if (unlikely(np->rx_mcl[i].result != i))
-                       panic("Unable to reduce memory reservation\n");
-       } else
-               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
-                                        &reservation) != i)
-                       panic("Unable to reduce memory reservation\n");
+       } else {
+               wmb();
+       }
 
        /* Above is a suitable barrier to ensure backend will see requests. */
        np->rx.req_prod_pvt = req_prod + i;
-       RING_PUSH_REQUESTS(&np->rx);
+ push:
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
+       if (notify)
+               notify_remote_via_irq(np->irq);
 }
 
 static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
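
Note the new exit path of network_alloc_rx_buffers(): instead of an unconditional RING_PUSH_REQUESTS(), the refill now ends with RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(), which publishes the new req_prod and reports whether it crossed the event threshold the backend asked for, so the event channel is kicked only when the backend may actually be waiting. The producer-side idiom, as used above (schematic):

	int notify;

	np->rx.req_prod_pvt = req_prod + i;	/* stage the new requests */
	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
	if (notify)
		notify_remote_via_irq(np->irq);	/* wake the backend only if needed */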
@@ -720,6 +865,7 @@ static int network_start_xmit(struct sk_
        unsigned short id;
        struct netfront_info *np = netdev_priv(dev);
        struct netif_tx_request *tx;
+       struct netif_extra_info *extra;
        char *data = skb->data;
        RING_IDX i;
        grant_ref_t ref;
@@ -740,7 +886,8 @@ static int network_start_xmit(struct sk_
        spin_lock_irq(&np->tx_lock);
 
        if (unlikely(!netif_carrier_ok(dev) ||
-                    (frags > 1 && !xennet_can_sg(dev)))) {
+                    (frags > 1 && !xennet_can_sg(dev)) ||
+                    netif_needs_gso(dev, skb))) {
                spin_unlock_irq(&np->tx_lock);
                goto drop;
        }
@@ -763,10 +910,35 @@ static int network_start_xmit(struct sk_
        tx->size = len;
 
        tx->flags = 0;
+       extra = NULL;
+
        if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
                tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
+#ifdef CONFIG_XEN
        if (skb->proto_data_valid) /* remote but checksummed? */
                tx->flags |= NETTXF_data_validated;
+#endif
+
+#ifdef HAVE_GSO
+       if (skb_shinfo(skb)->gso_size) {
+               struct netif_extra_info *gso = (struct netif_extra_info *)
+                       RING_GET_REQUEST(&np->tx, ++i);
+
+               if (extra)
+                       extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
+               else
+                       tx->flags |= NETTXF_extra_info;
+
+               gso->u.gso.size = skb_shinfo(skb)->gso_size;
+               gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+               gso->u.gso.pad = 0;
+               gso->u.gso.features = 0;
+
+               gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+               gso->flags = 0;
+               extra = gso;
+       }
+#endif
 
        np->tx.req_prod_pvt = i + 1;
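
A TSO packet carries more metadata than struct netif_tx_request can hold, so the transmit path claims the next ring slot (++i) and overlays struct netif_extra_info on it; NETTXF_extra_info in the first request announces the extra slot, and XEN_NETIF_EXTRA_FLAG_MORE would chain further extras behind it. This extra slot is also the likely reason netfront_tx_slot_available() above now demands MAX_SKB_FRAGS + 2 free requests. The resulting ring layout, sketched as a comment (illustrative, not from the patch):

/*
 * slot i      : netif_tx_request   gref/offset/size of the first chunk,
 *                                  flags |= NETTXF_extra_info
 * slot i + 1  : netif_extra_info   type  = XEN_NETIF_EXTRA_TYPE_GSO,
 *                                  u.gso = { size, type = TCPV4, ... }
 * slot i + 2+ : netif_tx_request   one per remaining frag
 *                                  (filled in by xennet_make_frags())
 */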
 
@@ -802,191 +974,411 @@ static irqreturn_t netif_int(int irq, vo
        unsigned long flags;
 
        spin_lock_irqsave(&np->tx_lock, flags);
-       network_tx_buf_gc(dev);
+
+       if (likely(netif_carrier_ok(dev))) {
+               network_tx_buf_gc(dev);
+               /* Under tx_lock: protects access to rx shared-ring indexes. */
+               if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
+                       netif_rx_schedule(dev);
+       }
+
        spin_unlock_irqrestore(&np->tx_lock, flags);
 
-       if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx) &&
-           likely(netif_running(dev)))
-               netif_rx_schedule(dev);
-
        return IRQ_HANDLED;
 }
 
-
-static int netif_poll(struct net_device *dev, int *pbudget)
-{
-       struct netfront_info *np = netdev_priv(dev);
-       struct sk_buff *skb, *nskb;
-       struct netif_rx_response *rx;
-       RING_IDX i, rp;
-       struct mmu_update *mmu = np->rx_mmu;
-       struct multicall_entry *mcl = np->rx_mcl;
-       int work_done, budget, more_to_do = 1;
-       struct sk_buff_head rxq;
-       unsigned long flags;
-       unsigned long mfn;
-       grant_ref_t ref;
-
-       spin_lock(&np->rx_lock);
-
-       if (unlikely(!netif_carrier_ok(dev))) {
-               spin_unlock(&np->rx_lock);
-               return 0;
-       }
-
-       skb_queue_head_init(&rxq);
-
-       if ((budget = *pbudget) > dev->quota)
-               budget = dev->quota;
-       rp = np->rx.sring->rsp_prod;
-       rmb(); /* Ensure we see queued responses up to 'rp'. */
-
-       for (i = np->rx.rsp_cons, work_done = 0;
-            (i != rp) && (work_done < budget);
-            i++, work_done++) {
-               rx = RING_GET_RESPONSE(&np->rx, i);
+static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
+                               grant_ref_t ref)
+{
+       int new = xennet_rxidx(np->rx.req_prod_pvt);
+
+       BUG_ON(np->rx_skbs[new]);
+       np->rx_skbs[new] = skb;
+       np->grant_rx_ref[new] = ref;
+       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
+       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
+       np->rx.req_prod_pvt++;
+}
+
+int xennet_get_extras(struct netfront_info *np,
+                     struct netif_extra_info *extras, RING_IDX rp)
+{
+       struct netif_extra_info *extra;
+       RING_IDX cons = np->rx.rsp_cons;
+       int err = 0;
+
+       do {
+               struct sk_buff *skb;
+               grant_ref_t ref;
+
+               if (unlikely(cons + 1 == rp)) {
+                       if (net_ratelimit())
+                               WPRINTK("Missing extra info\n");
+                       err = -EBADR;
+                       break;
+               }
+
+               extra = (struct netif_extra_info *)
+                       RING_GET_RESPONSE(&np->rx, ++cons);
+
+               if (unlikely(!extra->type ||
+                            extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+                       if (net_ratelimit())
+                               WPRINTK("Invalid extra type: %d\n",
+                                       extra->type);
+                       err = -EINVAL;
+               } else {
+                       memcpy(&extras[extra->type - 1], extra,
+                              sizeof(*extra));
+               }
+
+               skb = xennet_get_rx_skb(np, cons);
+               ref = xennet_get_rx_ref(np, cons);
+               xennet_move_rx_slot(np, skb, ref);
+       } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
+       np->rx.rsp_cons = cons;
+       return err;
+}
+
+static int xennet_get_responses(struct netfront_info *np,
+                               struct netfront_rx_info *rinfo, RING_IDX rp,
+                               struct sk_buff_head *list,
+                               int *pages_flipped_p)
+{
+       int pages_flipped = *pages_flipped_p;
+       struct mmu_update *mmu;
+       struct multicall_entry *mcl;
+       struct netif_rx_response *rx = &rinfo->rx;
+       struct netif_extra_info *extras = rinfo->extras;
+       RING_IDX cons = np->rx.rsp_cons;
+       struct sk_buff *skb = xennet_get_rx_skb(np, cons);
+       grant_ref_t ref = xennet_get_rx_ref(np, cons);
+       int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
+       int frags = 1;
+       int err = 0;
+       unsigned long ret;
+
+       if (rx->flags & NETRXF_extra_info) {
+               err = xennet_get_extras(np, extras, rp);
+               cons = np->rx.rsp_cons;
+       }
+
+       for (;;) {
+               unsigned long mfn;
+
+               if (unlikely(rx->status < 0 ||
+                            rx->offset + rx->status > PAGE_SIZE)) {
+                       if (net_ratelimit())
+                               WPRINTK("rx->offset: %x, size: %u\n",
+                                       rx->offset, rx->status);
+                       err = -EINVAL;
+                       goto next;
+               }
 
                /*
                 * This definitely indicates a bug, either in this driver or in
                 * the backend driver. In future this should flag the bad
                 * situation to the system controller to reboot the backend.
                 */
-               if ((ref = np->grant_rx_ref[rx->id]) == GRANT_INVALID_REF) {
+               if (ref == GRANT_INVALID_REF) {
                        WPRINTK("Bad rx response id %d.\n", rx->id);
+                       err = -EINVAL;
+                       goto next;
+               }
+
+               if (!np->copying_receiver) {
+                       /* Memory pressure, insufficient buffer
+                        * headroom, ... */
+                       if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) {
+                               if (net_ratelimit())
+                                       WPRINTK("Unfulfilled rx req "
+                                               "(id=%d, st=%d).\n",
+                                               rx->id, rx->status);
+                               xennet_move_rx_slot(np, skb, ref);
+                               err = -ENOMEM;
+                               goto next;
+                       }
+
+                       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                               /* Remap the page. */
+                               struct page *page =
+                                       skb_shinfo(skb)->frags[0].page;
+                               unsigned long pfn = page_to_pfn(page);
+                               void *vaddr = page_address(page);
+
+                               mcl = np->rx_mcl + pages_flipped;
+                               mmu = np->rx_mmu + pages_flipped;
+
+                               MULTI_update_va_mapping(mcl,
+                                                       (unsigned long)vaddr,
+                                                       pfn_pte_ma(mfn,
+                                                                  PAGE_KERNEL),
+                                                       0);
+                               mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+                                       | MMU_MACHPHYS_UPDATE;
+                               mmu->val = pfn;
+
+                               set_phys_to_machine(pfn, mfn);
+                       }
+                       pages_flipped++;
+               } else {
+                       ret = gnttab_end_foreign_access_ref(ref, 0);
+                       BUG_ON(!ret);
+               }
+
+               gnttab_release_grant_reference(&np->gref_rx_head, ref);
+
+               __skb_queue_tail(list, skb);
+
+next:
+               if (!(rx->flags & NETRXF_more_data))
+                       break;
+
+               if (cons + frags == rp) {
+                       if (net_ratelimit())
+                               WPRINTK("Need more frags\n");
+                       err = -ENOENT;
+                       break;
+               }
+
+               rx = RING_GET_RESPONSE(&np->rx, cons + frags);
+               skb = xennet_get_rx_skb(np, cons + frags);
+               ref = xennet_get_rx_ref(np, cons + frags);
+               frags++;
+       }
+
+       if (unlikely(frags > max)) {
+               if (net_ratelimit())
+                       WPRINTK("Too many frags\n");
+               err = -E2BIG;
+       }
+
+       *pages_flipped_p = pages_flipped;
+
+       return err;
+}
+
+static RING_IDX xennet_fill_frags(struct netfront_info *np,
+                                 struct sk_buff *skb,
+                                 struct sk_buff_head *list)
+{
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       int nr_frags = shinfo->nr_frags;
+       RING_IDX cons = np->rx.rsp_cons;
+       skb_frag_t *frag = shinfo->frags + nr_frags;
+       struct sk_buff *nskb;
+
+       while ((nskb = __skb_dequeue(list))) {
+               struct netif_rx_response *rx =
+                       RING_GET_RESPONSE(&np->rx, ++cons);
+
+               frag->page = skb_shinfo(nskb)->frags[0].page;
+               frag->page_offset = rx->offset;
+               frag->size = rx->status;
+
+               skb->data_len += rx->status;
+
+               skb_shinfo(nskb)->nr_frags = 0;
+               kfree_skb(nskb);
+
+               frag++;
+               nr_frags++;
+       }
+
+       shinfo->nr_frags = nr_frags;
+       return cons;
+}
+
+static int xennet_set_skb_gso(struct sk_buff *skb,
+                             struct netif_extra_info *gso)
+{
+       if (!gso->u.gso.size) {
+               if (net_ratelimit())
+                       WPRINTK("GSO size must not be zero.\n");
+               return -EINVAL;
+       }
+
+       /* Currently only TCPv4 S.O. is supported. */
+       if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+               if (net_ratelimit())
+                       WPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
+               return -EINVAL;
+       }
+
+#ifdef HAVE_GSO
+       skb_shinfo(skb)->gso_size = gso->u.gso.size;
+       skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+       /* Header must be checked, and gso_segs computed. */
+       skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+       skb_shinfo(skb)->gso_segs = 0;
+
+       return 0;
+#else
+       if (net_ratelimit())
+               WPRINTK("GSO unsupported by this kernel.\n");
+       return -EINVAL;
+#endif
+}
+
+static int netif_poll(struct net_device *dev, int *pbudget)
+{
+       struct netfront_info *np = netdev_priv(dev);
+       struct sk_buff *skb;
+       struct netfront_rx_info rinfo;
+       struct netif_rx_response *rx = &rinfo.rx;
+       struct netif_extra_info *extras = rinfo.extras;
+       RING_IDX i, rp;
+       struct multicall_entry *mcl;
+       int work_done, budget, more_to_do = 1;
+       struct sk_buff_head rxq;
+       struct sk_buff_head errq;
+       struct sk_buff_head tmpq;
+       unsigned long flags;
+       unsigned int len;
+       int pages_flipped = 0;
+       int err;
+
+       spin_lock(&np->rx_lock);
+
+       if (unlikely(!netif_carrier_ok(dev))) {
+               spin_unlock(&np->rx_lock);
+               return 0;
+       }
+
+       skb_queue_head_init(&rxq);
+       skb_queue_head_init(&errq);
+       skb_queue_head_init(&tmpq);
+
+       if ((budget = *pbudget) > dev->quota)
+               budget = dev->quota;
+       rp = np->rx.sring->rsp_prod;
+       rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+       for (i = np->rx.rsp_cons, work_done = 0;
+            (i != rp) && (work_done < budget);
+            np->rx.rsp_cons = ++i, work_done++) {
+               memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
+               memset(extras, 0, sizeof(extras));
+
+               err = xennet_get_responses(np, &rinfo, rp, &tmpq,
+                                          &pages_flipped);
+
+               if (unlikely(err)) {
+err:
+                       i = np->rx.rsp_cons + skb_queue_len(&tmpq) - 1;
                        work_done--;
+                       while ((skb = __skb_dequeue(&tmpq)))
+                               __skb_queue_tail(&errq, skb);
+                       np->stats.rx_errors++;
                        continue;
                }
 
-               /* Memory pressure, insufficient buffer headroom, ... */
-               if ((mfn = gnttab_end_foreign_transfer_ref(ref)) == 0) {
-                       if (net_ratelimit())
-                               WPRINTK("Unfulfilled rx req (id=%d, st=%d).\n",
-                                       rx->id, rx->status);
-                       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id =
-                               rx->id;
-                       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref =
-                               ref;
-                       np->rx.req_prod_pvt++;
-                       RING_PUSH_REQUESTS(&np->rx);
-                       work_done--;
-                       continue;
+               skb = __skb_dequeue(&tmpq);
+
+               if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+                       struct netif_extra_info *gso;
+                       gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+                       if (unlikely(xennet_set_skb_gso(skb, gso))) {
+                               __skb_queue_head(&tmpq, skb);
+                               goto err;
+                       }
                }
 
-               gnttab_release_grant_reference(&np->gref_rx_head, ref);
-               np->grant_rx_ref[rx->id] = GRANT_INVALID_REF;
-
-               skb = np->rx_skbs[rx->id];
-               add_id_to_freelist(np->rx_skbs, rx->id);
-
-               /* NB. We handle skb overflow later. */
-               skb->data = skb->head + rx->offset;
-               skb->len  = rx->status;
-               skb->tail = skb->data + skb->len;
+               skb->nh.raw = (void *)skb_shinfo(skb)->frags[0].page;
+               skb->h.raw = skb->nh.raw + rx->offset;
+
+               len = rx->status;
+               if (len > RX_COPY_THRESHOLD)
+                       len = RX_COPY_THRESHOLD;
+               skb_put(skb, len);
+
+               if (rx->status > len) {
+                       skb_shinfo(skb)->frags[0].page_offset =
+                               rx->offset + len;
+                       skb_shinfo(skb)->frags[0].size = rx->status - len;
+                       skb->data_len = rx->status - len;
+               } else {
+                       skb_shinfo(skb)->frags[0].page = NULL;
+                       skb_shinfo(skb)->nr_frags = 0;
+               }
+
+               i = xennet_fill_frags(np, skb, &tmpq);
+
+               /*
+                * Truesize must approximate the size of true data plus
+                * any supervisor overheads. Adding hypervisor overheads
+                * has been shown to significantly reduce achievable
+                * bandwidth with the default receive buffer size. It is
+                * therefore not wise to account for it here.
+                *
+                * After alloc_skb(RX_COPY_THRESHOLD), truesize is set to
+                * RX_COPY_THRESHOLD + the supervisor overheads. Here, we
+                * add the size of the data pulled in xennet_fill_frags().
+                *
+                * We also adjust for any unused space in the main data
+                * area by subtracting (RX_COPY_THRESHOLD - len). This is
+                * especially important with drivers which split incoming
+                * packets into header and data, using only 66 bytes of
+                * the main data area (see the e1000 driver for example.)
+                * On such systems, without this last adjustment, our
+                * achievable receive throughput using the standard receive
+                * buffer size was cut by 25%(!!!).
+                */
+               skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
+               skb->len += skb->data_len;
 
                /*
                 * Old backends do not assert data_validated but we
                 * can infer it from csum_blank so test both flags.
                 */
-               if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank)) {
+               if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank))
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
-                       skb->proto_data_valid = 1;
-               } else {
+               else
                        skb->ip_summed = CHECKSUM_NONE;
-                       skb->proto_data_valid = 0;
+#ifdef CONFIG_XEN
+               skb->proto_data_valid = (skb->ip_summed != CHECKSUM_NONE);
+               skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
+#endif
+               np->stats.rx_packets++;
+               np->stats.rx_bytes += skb->len;
+
+               __skb_queue_tail(&rxq, skb);
+       }
+
+       if (pages_flipped) {
+               /* Some pages are no longer absent... */
+               balloon_update_driver_allowance(-pages_flipped);
+
+               /* Do all the remapping work and M2P updates. */
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       mcl = np->rx_mcl + pages_flipped;
+                       mcl->op = __HYPERVISOR_mmu_update;
+                       mcl->args[0] = (unsigned long)np->rx_mmu;
+                       mcl->args[1] = pages_flipped;
+                       mcl->args[2] = 0;
+                       mcl->args[3] = DOMID_SELF;
+                       (void)HYPERVISOR_multicall(np->rx_mcl,
+                                                  pages_flipped + 1);
                }
-               skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
-
-               np->stats.rx_packets++;
-               np->stats.rx_bytes += rx->status;
-
-               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-                       /* Remap the page. */
-                       MULTI_update_va_mapping(mcl, (unsigned long)skb->head,
-                                               pfn_pte_ma(mfn, PAGE_KERNEL),
-                                               0);
-                       mcl++;
-                       mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
-                               | MMU_MACHPHYS_UPDATE;
-                       mmu->val = __pa(skb->head) >> PAGE_SHIFT;
-                       mmu++;
-
-                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
-                                           mfn);
-               }
-
-               __skb_queue_tail(&rxq, skb);
-       }
-
-       /* Some pages are no longer absent... */
-       balloon_update_driver_allowance(-work_done);
-
-       /* Do all the remapping work, and M2P updates, in one big hypercall. */
-       if (likely((mcl - np->rx_mcl) != 0)) {
-               mcl->op = __HYPERVISOR_mmu_update;
-               mcl->args[0] = (unsigned long)np->rx_mmu;
-               mcl->args[1] = mmu - np->rx_mmu;
-               mcl->args[2] = 0;
-               mcl->args[3] = DOMID_SELF;
-               mcl++;
-               (void)HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
-       }
+       }
+
+       while ((skb = __skb_dequeue(&errq)))
+               kfree_skb(skb);
 
        while ((skb = __skb_dequeue(&rxq)) != NULL) {
-               if (skb->len > (dev->mtu + ETH_HLEN + 4)) {
-                       if (net_ratelimit())
-                               printk(KERN_INFO "Received packet too big for "
-                                      "MTU (%d > %d)\n",
-                                      skb->len - ETH_HLEN - 4, dev->mtu);
-                       skb->len  = 0;
-                       skb->tail = skb->data;
-                       init_skb_shinfo(skb);
-                       dev_kfree_skb(skb);
-                       continue;
-               }
-
-               /*
-                * Enough room in skbuff for the data we were passed? Also,
-                * Linux expects at least 16 bytes headroom in each rx buffer.
-                */
-               if (unlikely(skb->tail > skb->end) ||
-                   unlikely((skb->data - skb->head) < 16)) {
-                       if (net_ratelimit()) {
-                               if (skb->tail > skb->end)
-                                       printk(KERN_INFO "Received packet "
-                                              "is %zd bytes beyond tail.\n",
-                                              skb->tail - skb->end);
-                               else
-                                       printk(KERN_INFO "Received packet "
-                                              "is %zd bytes before head.\n",
-                                              16 - (skb->data - skb->head));
-                       }
-
-                       nskb = __dev_alloc_skb(skb->len + 2,
-                                              GFP_ATOMIC|__GFP_NOWARN);
-                       if (nskb != NULL) {
-                               skb_reserve(nskb, 2);
-                               skb_put(nskb, skb->len);
-                               memcpy(nskb->data, skb->data, skb->len);
-                               /* Copy any other fields we already set up. */
-                               nskb->dev = skb->dev;
-                               nskb->ip_summed = skb->ip_summed;
-                               nskb->proto_data_valid = skb->proto_data_valid;
-                               nskb->proto_csum_blank = skb->proto_csum_blank;
-                       }
-
-                       /* Reinitialise and then destroy the old skbuff. */
-                       skb->len  = 0;
-                       skb->tail = skb->data;
-                       init_skb_shinfo(skb);
-                       dev_kfree_skb(skb);
-
-                       /* Switch old for new, if we copied the buffer. */
-                       if ((skb = nskb) == NULL)
-                               continue;
-               }
-
-               /* Set the shinfo area, which is hidden behind the data. */
-               init_skb_shinfo(skb);
+               struct page *page = (struct page *)skb->nh.raw;
+               void *vaddr = page_address(page);
+
+               memcpy(skb->data, vaddr + (skb->h.raw - skb->nh.raw),
+                      skb_headlen(skb));
+
+               if (page != skb_shinfo(skb)->frags[0].page)
+                       __free_page(page);
+
                /* Ethernet work: Delayed to here as it peeks the header. */
                skb->protocol = eth_type_trans(skb, dev);
 
@@ -994,8 +1386,6 @@ static int netif_poll(struct net_device 
                netif_receive_skb(skb);
                dev->last_rx = jiffies;
        }
-
-       np->rx.rsp_cons = i;
 
        /* If we get a callback with very few responses, reduce fill target. */
        /* NB. Note exponential increase, linear decrease. */
@@ -1024,74 +1414,12 @@ static int netif_poll(struct net_device 
        return more_to_do;
 }
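
netif_poll() keeps the original NAPI contract of this kernel generation: the core hands in a budget through *pbudget, the driver additionally clamps it to dev->quota, charges the work it performed against both counters, and returns nonzero while unconsumed responses remain so that it is polled again. The accounting shape, schematically (this is the pre-napi_struct interface, shown for orientation only):

	budget = min(*pbudget, dev->quota);
	/* ... consume up to 'budget' rx responses, counting work_done ... */
	dev->quota -= work_done;
	*pbudget   -= work_done;
	return more_to_do;	/* nonzero: poll me again */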
 
-
-static int network_close(struct net_device *dev)
-{
-       struct netfront_info *np = netdev_priv(dev);
-       netif_stop_queue(np->netdev);
-       return 0;
-}
-
-
-static struct net_device_stats *network_get_stats(struct net_device *dev)
-{
-       struct netfront_info *np = netdev_priv(dev);
-       return &np->stats;
-}
-
-static int xennet_change_mtu(struct net_device *dev, int mtu)
-{
-       int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
-
-       if (mtu > max)
-               return -EINVAL;
-       dev->mtu = mtu;
-       return 0;
-}
-
-static int xennet_set_sg(struct net_device *dev, u32 data)
-{
-       if (data) {
-               struct netfront_info *np = netdev_priv(dev);
-               int val;
-
-               if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
-                                "%d", &val) < 0)
-                       val = 0;
-               if (!val)
-                       return -ENOSYS;
-       } else if (dev->mtu > ETH_DATA_LEN)
-               dev->mtu = ETH_DATA_LEN;
-
-       return ethtool_op_set_sg(dev, data);
-}
-
-static void xennet_set_features(struct net_device *dev)
-{
-       xennet_set_sg(dev, 1);
-}
-
-static void network_connect(struct net_device *dev)
-{
-       struct netfront_info *np = netdev_priv(dev);
-       int i, requeue_idx;
+static void netif_release_tx_bufs(struct netfront_info *np)
+{
        struct sk_buff *skb;
-
-       xennet_set_features(dev);
-
-       spin_lock_irq(&np->tx_lock);
-       spin_lock(&np->rx_lock);
-
-       /*
-         * Recovery procedure:
-        *  NB. Freelist index entries are always going to be less than
-        *  PAGE_OFFSET, whereas pointers to skbs will always be equal or
-        *  greater than PAGE_OFFSET: we use this property to distinguish
-        *  them.
-         */
-
-       /* Step 1: Discard all pending TX packet fragments. */
-       for (requeue_idx = 0, i = 1; i <= NET_TX_RING_SIZE; i++) {
+       int i;
+
+       for (i = 1; i <= NET_TX_RING_SIZE; i++) {
                if ((unsigned long)np->tx_skbs[i] < PAGE_OFFSET)
                        continue;
 
@@ -1104,22 +1432,218 @@ static void network_connect(struct net_d
                add_id_to_freelist(np->tx_skbs, i);
                dev_kfree_skb_irq(skb);
        }
+}
+
+static void netif_release_rx_bufs(struct netfront_info *np)
+{
+       struct mmu_update      *mmu = np->rx_mmu;
+       struct multicall_entry *mcl = np->rx_mcl;
+       struct sk_buff_head free_list;
+       struct sk_buff *skb;
+       unsigned long mfn;
+       int xfer = 0, noxfer = 0, unused = 0;
+       int id, ref;
+
+       if (np->copying_receiver) {
+               printk("%s: fix me for copying receiver.\n", __FUNCTION__);
+               return;
+       }
+
+       skb_queue_head_init(&free_list);
+
+       spin_lock(&np->rx_lock);
+
+       for (id = 0; id < NET_RX_RING_SIZE; id++) {
+               if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) {
+                       unused++;
+                       continue;
+               }
+
+               skb = np->rx_skbs[id];
+               mfn = gnttab_end_foreign_transfer_ref(ref);
+               gnttab_release_grant_reference(&np->gref_rx_head, ref);
+               np->grant_rx_ref[id] = GRANT_INVALID_REF;
+               add_id_to_freelist(np->rx_skbs, id);
+
+               if (0 == mfn) {
+                       struct page *page = skb_shinfo(skb)->frags[0].page;
+                       balloon_release_driver_page(page);
+                       skb_shinfo(skb)->nr_frags = 0;
+                       dev_kfree_skb(skb);
+                       noxfer++;
+                       continue;
+               }
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       /* Remap the page. */
+                       struct page *page = skb_shinfo(skb)->frags[0].page;
+                       unsigned long pfn = page_to_pfn(page);
+                       void *vaddr = page_address(page);
+
+                       MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
+                                               pfn_pte_ma(mfn, PAGE_KERNEL),
+                                               0);
+                       mcl++;
+                       mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+                               | MMU_MACHPHYS_UPDATE;
+                       mmu->val = pfn;
+                       mmu++;
+
+                       set_phys_to_machine(pfn, mfn);
+               }
+               __skb_queue_tail(&free_list, skb);
+               xfer++;
+       }
+
+       printk("%s: %d xfer, %d noxfer, %d unused\n",
+              __FUNCTION__, xfer, noxfer, unused);
+
+       if (xfer) {
+               /* Some pages are no longer absent... */
+               balloon_update_driver_allowance(-xfer);
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       /* Do all the remapping work and M2P updates. */
+                       mcl->op = __HYPERVISOR_mmu_update;
+                       mcl->args[0] = (unsigned long)np->rx_mmu;
+                       mcl->args[1] = mmu - np->rx_mmu;
+                       mcl->args[2] = 0;
+                       mcl->args[3] = DOMID_SELF;
+                       mcl++;
+                       HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
+               }
+       }
+
+       while ((skb = __skb_dequeue(&free_list)) != NULL)
+               dev_kfree_skb(skb);
+
+       spin_unlock(&np->rx_lock);
+}
+
+static int network_close(struct net_device *dev)
+{
+       struct netfront_info *np = netdev_priv(dev);
+       netif_stop_queue(np->netdev);
+       return 0;
+}
+
+
+static struct net_device_stats *network_get_stats(struct net_device *dev)
+{
+       struct netfront_info *np = netdev_priv(dev);
+       return &np->stats;
+}
+
+static int xennet_change_mtu(struct net_device *dev, int mtu)
+{
+       int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+       if (mtu > max)
+               return -EINVAL;
+       dev->mtu = mtu;
+       return 0;
+}
+
+static int xennet_set_sg(struct net_device *dev, u32 data)
+{
+       if (data) {
+               struct netfront_info *np = netdev_priv(dev);
+               int val;
+
+               if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
+                                "%d", &val) < 0)
+                       val = 0;
+               if (!val)
+                       return -ENOSYS;
+       } else if (dev->mtu > ETH_DATA_LEN)
+               dev->mtu = ETH_DATA_LEN;
+
+       return ethtool_op_set_sg(dev, data);
+}
+
+static int xennet_set_tso(struct net_device *dev, u32 data)
+{
+#ifdef HAVE_GSO
+       if (data) {
+               struct netfront_info *np = netdev_priv(dev);
+               int val;
+
+               if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+                                "feature-gso-tcpv4", "%d", &val) < 0)
+                       val = 0;
+               if (!val)
+                       return -ENOSYS;
+       }
+
+       return ethtool_op_set_tso(dev, data);
+#else
+       return -ENOSYS;
+#endif
+}
+
+static void xennet_set_features(struct net_device *dev)
+{
+       dev_disable_gso_features(dev);
+       xennet_set_sg(dev, 0);
+
+       /* We need checksum offload to enable scatter/gather and TSO. */
+       if (!(dev->features & NETIF_F_IP_CSUM))
+               return;
+
+       if (!xennet_set_sg(dev, 1))
+               xennet_set_tso(dev, 1);
+}
+
+static void network_connect(struct net_device *dev)
+{
+       struct netfront_info *np = netdev_priv(dev);
+       int i, requeue_idx;
+       struct sk_buff *skb;
+       grant_ref_t ref;
+       netif_rx_request_t *req;
+
+       xennet_set_features(dev);
+
+       spin_lock_irq(&np->tx_lock);
+       spin_lock(&np->rx_lock);
+
+       /*
+        * Recovery procedure:
+        *  NB. Freelist index entries are always going to be less than
+        *  PAGE_OFFSET, whereas pointers to skbs will always be equal to or
+        *  greater than PAGE_OFFSET: we use this property to distinguish
+        *  them.
+        */
+
+       /* Step 1: Discard all pending TX packet fragments. */
+       netif_release_tx_bufs(np);
 
        /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
-       for (requeue_idx = 0, i = 1; i <= NET_RX_RING_SIZE; i++) {
-               if ((unsigned long)np->rx_skbs[i] < PAGE_OFFSET)
+       for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
+               if (!np->rx_skbs[i])
                        continue;
-               gnttab_grant_foreign_transfer_ref(
-                       np->grant_rx_ref[i], np->xbdev->otherend_id,
-                       __pa(np->rx_skbs[i]->data) >> PAGE_SHIFT);
-               RING_GET_REQUEST(&np->rx, requeue_idx)->gref =
-                       np->grant_rx_ref[i];
-               RING_GET_REQUEST(&np->rx, requeue_idx)->id = i;
+
+               skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
+               ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
+               req = RING_GET_REQUEST(&np->rx, requeue_idx);
+
+               if (!np->copying_receiver) {
+                       gnttab_grant_foreign_transfer_ref(
+                               ref, np->xbdev->otherend_id,
+                               page_to_pfn(skb_shinfo(skb)->frags->page));
+               } else {
+                       gnttab_grant_foreign_access_ref(
+                               ref, np->xbdev->otherend_id,
+                               page_to_pfn(skb_shinfo(skb)->frags->page),
+                               0);
+               }
+               req->gref = ref;
+               req->id   = requeue_idx;
+
                requeue_idx++;
        }
 
        np->rx.req_prod_pvt = requeue_idx;
-       RING_PUSH_REQUESTS(&np->rx);
 
        /*
         * Step 3: All public and private state should now be sane.  Get
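
The Step 2 requeue loop above restores the invariant a freshly connected backend expects: requests are consumed from slot 0 upward, so every rx buffer that survived the disconnect is re-granted (by transfer or by access, matching the negotiated receiver mode) into consecutive slots 0..n-1 regardless of its old id, and req_prod_pvt becomes the survivor count. Condensed from the hunk above (schematic):

	for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
		if (!np->rx_skbs[i])
			continue;	/* slot i held no buffer */
		/* re-grant buffer i into fresh slot requeue_idx ... */
		requeue_idx++;
	}
	np->rx.req_prod_pvt = requeue_idx;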
@@ -1139,6 +1663,8 @@ static void netif_uninit(struct net_devi
 static void netif_uninit(struct net_device *dev)
 {
        struct netfront_info *np = netdev_priv(dev);
+       netif_release_tx_bufs(np);
+       netif_release_rx_bufs(np);
        gnttab_free_grant_references(np->gref_tx_head);
        gnttab_free_grant_references(np->gref_rx_head);
 }
@@ -1149,6 +1675,9 @@ static struct ethtool_ops network_ethtoo
        .set_tx_csum = ethtool_op_set_tx_csum,
        .get_sg = ethtool_op_get_sg,
        .set_sg = xennet_set_sg,
+       .get_tso = ethtool_op_get_tso,
+       .set_tso = xennet_set_tso,
+       .get_link = ethtool_op_get_link,
 };
 
 #ifdef CONFIG_SYSFS
@@ -1294,13 +1823,8 @@ static void network_set_multicast_list(s
 {
 }
 
-/** Create a network device.
- * @param handle device handle
- * @param val return parameter for created device
- * @return 0 on success, error code otherwise
- */
-static struct net_device * __devinit create_netdev(int handle,
-                                                  struct xenbus_device *dev)
+static struct net_device * __devinit
+create_netdev(int handle, int copying_receiver, struct xenbus_device *dev)
 {
        int i, err = 0;
        struct net_device *netdev = NULL;
@@ -1313,9 +1837,10 @@ static struct net_device * __devinit cre
                return ERR_PTR(-ENOMEM);
        }
 
-       np                = netdev_priv(netdev);
-       np->handle        = handle;
-       np->xbdev         = dev;
+       np                   = netdev_priv(netdev);
+       np->handle           = handle;
+       np->xbdev            = dev;
+       np->copying_receiver = copying_receiver;
 
        netif_carrier_off(netdev);
 
@@ -1337,8 +1862,8 @@ static struct net_device * __devinit cre
                np->grant_tx_ref[i] = GRANT_INVALID_REF;
        }
 
-       for (i = 0; i <= NET_RX_RING_SIZE; i++) {
-               np->rx_skbs[i] = (void *)((unsigned long) i+1);
+       for (i = 0; i < NET_RX_RING_SIZE; i++) {
+               np->rx_skbs[i] = NULL;
                np->grant_rx_ref[i] = GRANT_INVALID_REF;
        }
 
@@ -1372,27 +1897,9 @@ static struct net_device * __devinit cre
        SET_MODULE_OWNER(netdev);
        SET_NETDEV_DEV(netdev, &dev->dev);
 
-       err = register_netdev(netdev);
-       if (err) {
-               printk(KERN_WARNING "%s> register_netdev err=%d\n",
-                      __FUNCTION__, err);
-               goto exit_free_rx;
-       }
-
-       err = xennet_sysfs_addif(netdev);
-       if (err) {
-               /* This can be non-fatal: it only means no tuning parameters */
-               printk(KERN_WARNING "%s> add sysfs failed err=%d\n",
-                      __FUNCTION__, err);
-       }
-
        np->netdev = netdev;
-
        return netdev;
 
-
- exit_free_rx:
-       gnttab_free_grant_references(np->gref_rx_head);
  exit_free_tx:
        gnttab_free_grant_references(np->gref_tx_head);
  exit:
@@ -1431,10 +1938,9 @@ static void netfront_closing(struct xenb
 {
        struct netfront_info *info = dev->dev.driver_data;
 
-       DPRINTK("netfront_closing: %s removed\n", dev->nodename);
+       DPRINTK("%s\n", dev->nodename);
 
        close_netdev(info);
-
        xenbus_switch_state(dev, XenbusStateClosed);
 }
 
@@ -1451,6 +1957,26 @@ static int __devexit netfront_remove(str
        return 0;
 }
 
+
+static int open_netdev(struct netfront_info *info)
+{
+       int err;
+
+       err = register_netdev(info->netdev);
+       if (err) {
+               printk(KERN_WARNING "%s: register_netdev err=%d\n",
+                      __FUNCTION__, err);
+               return err;
+       }
+
+       err = xennet_sysfs_addif(info->netdev);
+       if (err) {
+               /* This can be non-fatal: it only means no tuning parameters */
+               printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
+                      __FUNCTION__, err);
+       }
+       return 0;
+}
 
 static void close_netdev(struct netfront_info *info)
 {
@@ -1529,7 +2055,7 @@ static int __init netif_init(void)
        if (!is_running_on_xen())
                return -ENODEV;
 
-       if (xen_start_info->flags & SIF_INITDOMAIN)
+       if (is_initial_xendomain())
                return 0;
 
        IPRINTK("Initialising virtual ethernet driver.\n");

_______________________________________________
Xen-ppc-devel mailing list
Xen-ppc-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-ppc-devel
